From 228cae30b117c2493f69ad3c195341cd6ec8d430 Mon Sep 17 00:00:00 2001 From: djm <> Date: Sat, 13 Oct 2012 21:23:50 +0000 Subject: import OpenSSL-1.0.1c --- src/lib/libcrypto/aes/aes.h | 5 + src/lib/libcrypto/aes/aes_core.c | 12 +- src/lib/libcrypto/aes/aes_misc.c | 21 + src/lib/libcrypto/aes/asm/aes-586.pl | 14 +- src/lib/libcrypto/aes/asm/aes-armv4.pl | 182 +- src/lib/libcrypto/aes/asm/aes-mips.pl | 1611 +++++++++++++ src/lib/libcrypto/aes/asm/aes-parisc.pl | 1021 ++++++++ src/lib/libcrypto/aes/asm/aes-ppc.pl | 444 ++-- src/lib/libcrypto/aes/asm/aes-s390x.pl | 1071 ++++++++- src/lib/libcrypto/aes/asm/aes-sparcv9.pl | 3 +- src/lib/libcrypto/aes/asm/aes-x86_64.pl | 45 +- src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl | 1249 ++++++++++ src/lib/libcrypto/aes/asm/aesni-x86.pl | 2189 +++++++++++++++++ src/lib/libcrypto/aes/asm/aesni-x86_64.pl | 2498 +++++++++++++++++-- src/lib/libcrypto/aes/asm/bsaes-x86_64.pl | 3044 ++++++++++++++++++++++++ src/lib/libcrypto/aes/asm/vpaes-x86.pl | 903 +++++++ src/lib/libcrypto/aes/asm/vpaes-x86_64.pl | 1206 ++++++++++ src/lib/libcrypto/arm_arch.h | 51 + src/lib/libcrypto/armcap.c | 80 + src/lib/libcrypto/armv4cpuid.S | 154 ++ src/lib/libcrypto/asn1/a_d2i_fp.c | 54 +- src/lib/libcrypto/asn1/a_digest.c | 6 +- src/lib/libcrypto/asn1/a_int.c | 4 +- src/lib/libcrypto/asn1/a_sign.c | 111 +- src/lib/libcrypto/asn1/a_verify.c | 77 +- src/lib/libcrypto/asn1/ameth_lib.c | 12 +- src/lib/libcrypto/asn1/asn1.h | 8 +- src/lib/libcrypto/asn1/asn1_err.c | 5 +- src/lib/libcrypto/asn1/asn1_locl.h | 11 + src/lib/libcrypto/asn1/asn_mime.c | 23 +- src/lib/libcrypto/asn1/n_pkey.c | 38 +- src/lib/libcrypto/asn1/p5_pbev2.c | 143 +- src/lib/libcrypto/asn1/t_crl.c | 3 +- src/lib/libcrypto/asn1/t_x509.c | 55 +- src/lib/libcrypto/asn1/tasn_prn.c | 12 +- src/lib/libcrypto/asn1/x_algor.c | 14 + src/lib/libcrypto/asn1/x_name.c | 3 +- src/lib/libcrypto/asn1/x_pubkey.c | 11 +- src/lib/libcrypto/bf/bf_skey.c | 8 + src/lib/libcrypto/bf/blowfish.h | 4 +- src/lib/libcrypto/bio/b_sock.c | 2 +- src/lib/libcrypto/bio/bio.h | 70 +- src/lib/libcrypto/bio/bio_err.c | 3 +- src/lib/libcrypto/bio/bio_lib.c | 28 +- src/lib/libcrypto/bio/bss_bio.c | 18 +- src/lib/libcrypto/bio/bss_dgram.c | 996 ++++++++ src/lib/libcrypto/bn/asm/armv4-gf2m.pl | 278 +++ src/lib/libcrypto/bn/asm/armv4-mont.pl | 23 +- src/lib/libcrypto/bn/asm/ia64-mont.pl | 851 +++++++ src/lib/libcrypto/bn/asm/mips-mont.pl | 426 ++++ src/lib/libcrypto/bn/asm/mips.pl | 2585 ++++++++++++++++++++ src/lib/libcrypto/bn/asm/modexp512-x86_64.pl | 1496 ++++++++++++ src/lib/libcrypto/bn/asm/parisc-mont.pl | 993 ++++++++ src/lib/libcrypto/bn/asm/ppc-mont.pl | 107 +- src/lib/libcrypto/bn/asm/ppc.pl | 43 +- src/lib/libcrypto/bn/asm/ppc64-mont.pl | 338 ++- src/lib/libcrypto/bn/asm/s390x-gf2m.pl | 221 ++ src/lib/libcrypto/bn/asm/s390x-mont.pl | 102 +- src/lib/libcrypto/bn/asm/x86-gf2m.pl | 313 +++ src/lib/libcrypto/bn/asm/x86_64-gf2m.pl | 389 +++ src/lib/libcrypto/bn/asm/x86_64-mont.pl | 1486 +++++++++++- src/lib/libcrypto/bn/asm/x86_64-mont5.pl | 1070 +++++++++ src/lib/libcrypto/bn/bn.h | 15 + src/lib/libcrypto/bn/bn_div.c | 272 +-- src/lib/libcrypto/bn/bn_exp.c | 240 +- src/lib/libcrypto/bn/bn_gf2m.c | 114 +- src/lib/libcrypto/bn/bn_lcl.h | 23 +- src/lib/libcrypto/bn/bn_lib.c | 19 - src/lib/libcrypto/bn/bn_mont.c | 116 +- src/lib/libcrypto/bn/bn_nist.c | 338 ++- src/lib/libcrypto/bn/bn_print.c | 19 + src/lib/libcrypto/bn/bn_shift.c | 27 +- src/lib/libcrypto/buffer/buf_str.c | 99 +- src/lib/libcrypto/buffer/buffer.c | 75 +- src/lib/libcrypto/camellia/asm/cmll-x86.pl | 6 +- src/lib/libcrypto/camellia/camellia.h | 4 + src/lib/libcrypto/camellia/cmll_locl.h | 5 +- src/lib/libcrypto/camellia/cmll_misc.c | 3 +- src/lib/libcrypto/cast/c_skey.c | 9 +- src/lib/libcrypto/cast/cast.h | 4 +- src/lib/libcrypto/cmac/cm_ameth.c | 97 + src/lib/libcrypto/cmac/cm_pmeth.c | 224 ++ src/lib/libcrypto/cmac/cmac.c | 308 +++ src/lib/libcrypto/cmac/cmac.h | 82 + src/lib/libcrypto/cms/cms.h | 22 + src/lib/libcrypto/cms/cms_asn1.c | 9 + src/lib/libcrypto/cms/cms_enc.c | 60 +- src/lib/libcrypto/cms/cms_env.c | 22 +- src/lib/libcrypto/cms/cms_err.c | 13 +- src/lib/libcrypto/cms/cms_lcl.h | 12 + src/lib/libcrypto/cms/cms_lib.c | 3 +- src/lib/libcrypto/cms/cms_pwri.c | 454 ++++ src/lib/libcrypto/cms/cms_sd.c | 3 +- src/lib/libcrypto/cms/cms_smime.c | 61 +- src/lib/libcrypto/comp/c_rle.c | 4 +- src/lib/libcrypto/cpt_err.c | 4 +- src/lib/libcrypto/cryptlib.c | 40 +- src/lib/libcrypto/cryptlib.h | 2 +- src/lib/libcrypto/crypto.h | 29 + src/lib/libcrypto/des/des.h | 3 + src/lib/libcrypto/des/set_key.c | 9 + src/lib/libcrypto/dh/dh.h | 20 + src/lib/libcrypto/dh/dh_ameth.c | 1 + src/lib/libcrypto/dh/dh_err.c | 7 +- src/lib/libcrypto/dh/dh_gen.c | 17 + src/lib/libcrypto/dh/dh_key.c | 33 +- src/lib/libcrypto/dh/dh_lib.c | 15 +- src/lib/libcrypto/doc/EVP_DigestInit.pod | 66 +- src/lib/libcrypto/dsa/dsa.h | 20 + src/lib/libcrypto/dsa/dsa_ameth.c | 47 + src/lib/libcrypto/dsa/dsa_asn1.c | 40 +- src/lib/libcrypto/dsa/dsa_err.c | 7 +- src/lib/libcrypto/dsa/dsa_gen.c | 35 +- src/lib/libcrypto/dsa/dsa_key.c | 16 + src/lib/libcrypto/dsa/dsa_lib.c | 22 +- src/lib/libcrypto/dsa/dsa_locl.h | 1 + src/lib/libcrypto/dsa/dsa_ossl.c | 16 +- src/lib/libcrypto/dsa/dsa_pmeth.c | 6 +- src/lib/libcrypto/dsa/dsa_sign.c | 50 +- src/lib/libcrypto/dsa/dsa_vrf.c | 29 +- src/lib/libcrypto/dso/dso_dlfcn.c | 3 +- src/lib/libcrypto/ec/ec.h | 69 +- src/lib/libcrypto/ec/ec2_mult.c | 4 + src/lib/libcrypto/ec/ec2_oct.c | 407 ++++ src/lib/libcrypto/ec/ec2_smpl.c | 351 +-- src/lib/libcrypto/ec/ec_ameth.c | 1 + src/lib/libcrypto/ec/ec_asn1.c | 24 +- src/lib/libcrypto/ec/ec_curve.c | 197 +- src/lib/libcrypto/ec/ec_cvt.c | 28 +- src/lib/libcrypto/ec/ec_err.c | 20 +- src/lib/libcrypto/ec/ec_key.c | 102 +- src/lib/libcrypto/ec/ec_lcl.h | 55 +- src/lib/libcrypto/ec/ec_lib.c | 80 +- src/lib/libcrypto/ec/ec_oct.c | 199 ++ src/lib/libcrypto/ec/ec_pmeth.c | 1 + src/lib/libcrypto/ec/eck_prn.c | 3 +- src/lib/libcrypto/ec/ecp_mont.c | 14 +- src/lib/libcrypto/ec/ecp_nist.c | 13 +- src/lib/libcrypto/ec/ecp_nistp224.c | 1658 +++++++++++++ src/lib/libcrypto/ec/ecp_nistp256.c | 2171 +++++++++++++++++ src/lib/libcrypto/ec/ecp_nistp521.c | 2025 ++++++++++++++++ src/lib/libcrypto/ec/ecp_nistputil.c | 197 ++ src/lib/libcrypto/ec/ecp_oct.c | 433 ++++ src/lib/libcrypto/ec/ecp_smpl.c | 379 +-- src/lib/libcrypto/ecdh/ecdh.h | 2 + src/lib/libcrypto/ecdh/ech_err.c | 4 +- src/lib/libcrypto/ecdh/ech_lib.c | 20 + src/lib/libcrypto/ecdh/ech_locl.h | 8 + src/lib/libcrypto/ecdsa/ecdsa.h | 2 + src/lib/libcrypto/ecdsa/ecs_err.c | 4 +- src/lib/libcrypto/ecdsa/ecs_lib.c | 21 +- src/lib/libcrypto/ecdsa/ecs_locl.h | 8 + src/lib/libcrypto/ecdsa/ecs_ossl.c | 5 +- src/lib/libcrypto/engine/eng_all.c | 9 + src/lib/libcrypto/engine/eng_fat.c | 3 +- src/lib/libcrypto/engine/engine.h | 9 + src/lib/libcrypto/err/err.c | 13 +- src/lib/libcrypto/err/err.h | 3 +- src/lib/libcrypto/err/err_all.c | 7 + src/lib/libcrypto/evp/bio_md.c | 11 +- src/lib/libcrypto/evp/digest.c | 28 +- src/lib/libcrypto/evp/e_aes.c | 1273 +++++++++- src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c | 406 ++++ src/lib/libcrypto/evp/e_des3.c | 3 + src/lib/libcrypto/evp/e_null.c | 4 +- src/lib/libcrypto/evp/e_rc2.c | 3 +- src/lib/libcrypto/evp/e_rc4.c | 1 + src/lib/libcrypto/evp/e_rc4_hmac_md5.c | 298 +++ src/lib/libcrypto/evp/evp.h | 94 +- src/lib/libcrypto/evp/evp_enc.c | 95 +- src/lib/libcrypto/evp/evp_err.c | 20 +- src/lib/libcrypto/evp/evp_key.c | 27 +- src/lib/libcrypto/evp/evp_lib.c | 4 + src/lib/libcrypto/evp/evp_locl.h | 40 + src/lib/libcrypto/evp/evp_pbe.c | 5 + src/lib/libcrypto/evp/m_dss.c | 2 + src/lib/libcrypto/evp/m_dss1.c | 3 + src/lib/libcrypto/evp/m_ecdsa.c | 3 + src/lib/libcrypto/evp/m_md4.c | 2 + src/lib/libcrypto/evp/m_md5.c | 1 + src/lib/libcrypto/evp/m_ripemd.c | 1 + src/lib/libcrypto/evp/m_sha1.c | 5 + src/lib/libcrypto/evp/m_wp.c | 1 + src/lib/libcrypto/evp/names.c | 5 + src/lib/libcrypto/evp/p5_crpt.c | 33 +- src/lib/libcrypto/evp/p5_crpt2.c | 89 +- src/lib/libcrypto/evp/p_open.c | 3 +- src/lib/libcrypto/evp/p_seal.c | 3 +- src/lib/libcrypto/evp/p_sign.c | 10 +- src/lib/libcrypto/evp/p_verify.c | 10 +- src/lib/libcrypto/evp/pmeth_gn.c | 5 +- src/lib/libcrypto/evp/pmeth_lib.c | 55 +- src/lib/libcrypto/hmac/hm_ameth.c | 2 +- src/lib/libcrypto/hmac/hm_pmeth.c | 14 +- src/lib/libcrypto/hmac/hmac.c | 37 + src/lib/libcrypto/ia64cpuid.S | 2 +- src/lib/libcrypto/idea/i_cbc.c | 168 ++ src/lib/libcrypto/idea/i_cfb64.c | 122 + src/lib/libcrypto/idea/i_ecb.c | 85 + src/lib/libcrypto/idea/i_ofb64.c | 111 + src/lib/libcrypto/idea/i_skey.c | 164 ++ src/lib/libcrypto/idea/idea.h | 3 + src/lib/libcrypto/idea/idea_lcl.h | 215 ++ src/lib/libcrypto/md4/md4.h | 3 + src/lib/libcrypto/md4/md4_dgst.c | 5 +- src/lib/libcrypto/md5/md5.h | 3 + src/lib/libcrypto/md5/md5_dgst.c | 3 +- src/lib/libcrypto/modes/asm/ghash-alpha.pl | 451 ++++ src/lib/libcrypto/modes/asm/ghash-armv4.pl | 429 ++++ src/lib/libcrypto/modes/asm/ghash-ia64.pl | 463 ++++ src/lib/libcrypto/modes/asm/ghash-parisc.pl | 730 ++++++ src/lib/libcrypto/modes/asm/ghash-s390x.pl | 262 ++ src/lib/libcrypto/modes/asm/ghash-sparcv9.pl | 330 +++ src/lib/libcrypto/modes/asm/ghash-x86.pl | 1342 +++++++++++ src/lib/libcrypto/modes/asm/ghash-x86_64.pl | 805 +++++++ src/lib/libcrypto/modes/cbc128.c | 10 +- src/lib/libcrypto/modes/ccm128.c | 441 ++++ src/lib/libcrypto/modes/cfb128.c | 11 +- src/lib/libcrypto/modes/ctr128.c | 92 +- src/lib/libcrypto/modes/cts128.c | 226 +- src/lib/libcrypto/modes/gcm128.c | 1757 ++++++++++++++ src/lib/libcrypto/modes/modes.h | 76 + src/lib/libcrypto/modes/modes_lcl.h | 131 + src/lib/libcrypto/modes/ofb128.c | 11 +- src/lib/libcrypto/modes/xts128.c | 187 ++ src/lib/libcrypto/o_init.c | 34 +- src/lib/libcrypto/objects/obj_mac.num | 27 + src/lib/libcrypto/objects/obj_xref.c | 9 +- src/lib/libcrypto/objects/obj_xref.h | 2 + src/lib/libcrypto/objects/obj_xref.txt | 4 + src/lib/libcrypto/objects/objects.txt | 41 +- src/lib/libcrypto/ocsp/ocsp_lib.c | 3 +- src/lib/libcrypto/opensslv.h | 6 +- src/lib/libcrypto/ossl_typ.h | 2 + src/lib/libcrypto/pariscid.pl | 224 ++ src/lib/libcrypto/pem/pvkfmt.c | 58 +- src/lib/libcrypto/perlasm/ppc-xlate.pl | 13 +- src/lib/libcrypto/perlasm/x86_64-xlate.pl | 221 +- src/lib/libcrypto/perlasm/x86asm.pl | 55 +- src/lib/libcrypto/perlasm/x86gas.pl | 34 +- src/lib/libcrypto/pkcs12/p12_decr.c | 9 +- src/lib/libcrypto/pkcs12/p12_key.c | 16 +- src/lib/libcrypto/pkcs12/p12_kiss.c | 2 +- src/lib/libcrypto/pkcs12/p12_mutl.c | 12 +- src/lib/libcrypto/pkcs7/pk7_doit.c | 101 +- src/lib/libcrypto/pkcs7/pk7_smime.c | 25 +- src/lib/libcrypto/ppccap.c | 115 + src/lib/libcrypto/ppccpuid.pl | 48 +- src/lib/libcrypto/rand/rand.h | 9 + src/lib/libcrypto/rand/rand_err.c | 6 +- src/lib/libcrypto/rand/rand_lib.c | 119 + src/lib/libcrypto/rand/randfile.c | 2 +- src/lib/libcrypto/rc2/rc2.h | 4 +- src/lib/libcrypto/rc2/rc2_skey.c | 8 + src/lib/libcrypto/rc4/asm/rc4-586.pl | 162 +- src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl | 631 +++++ src/lib/libcrypto/rc4/asm/rc4-parisc.pl | 313 +++ src/lib/libcrypto/rc4/asm/rc4-s390x.pl | 47 +- src/lib/libcrypto/rc4/asm/rc4-x86_64.pl | 290 ++- src/lib/libcrypto/rc4/rc4.h | 1 + src/lib/libcrypto/rc4/rc4_skey.c | 36 +- src/lib/libcrypto/ripemd/ripemd.h | 3 + src/lib/libcrypto/ripemd/rmd_dgst.c | 3 +- src/lib/libcrypto/rsa/rsa.h | 79 + src/lib/libcrypto/rsa/rsa_ameth.c | 351 ++- src/lib/libcrypto/rsa/rsa_asn1.c | 10 + src/lib/libcrypto/rsa/rsa_crpt.c | 257 ++ src/lib/libcrypto/rsa/rsa_err.c | 21 +- src/lib/libcrypto/rsa/rsa_gen.c | 15 + src/lib/libcrypto/rsa/rsa_lib.c | 172 +- src/lib/libcrypto/rsa/rsa_oaep.c | 6 +- src/lib/libcrypto/rsa/rsa_pmeth.c | 154 +- src/lib/libcrypto/rsa/rsa_pss.c | 81 +- src/lib/libcrypto/rsa/rsa_sign.c | 33 + src/lib/libcrypto/s390xcap.c | 12 +- src/lib/libcrypto/s390xcpuid.S | 17 +- src/lib/libcrypto/sha/asm/sha1-586.pl | 1107 ++++++++- src/lib/libcrypto/sha/asm/sha1-alpha.pl | 322 +++ src/lib/libcrypto/sha/asm/sha1-armv4-large.pl | 38 +- src/lib/libcrypto/sha/asm/sha1-ia64.pl | 192 +- src/lib/libcrypto/sha/asm/sha1-mips.pl | 354 +++ src/lib/libcrypto/sha/asm/sha1-parisc.pl | 259 ++ src/lib/libcrypto/sha/asm/sha1-ppc.pl | 83 +- src/lib/libcrypto/sha/asm/sha1-s390x.pl | 50 +- src/lib/libcrypto/sha/asm/sha1-x86_64.pl | 1185 +++++++-- src/lib/libcrypto/sha/asm/sha256-586.pl | 52 +- src/lib/libcrypto/sha/asm/sha256-armv4.pl | 55 +- src/lib/libcrypto/sha/asm/sha512-armv4.pl | 357 ++- src/lib/libcrypto/sha/asm/sha512-mips.pl | 455 ++++ src/lib/libcrypto/sha/asm/sha512-parisc.pl | 791 ++++++ src/lib/libcrypto/sha/asm/sha512-ppc.pl | 114 +- src/lib/libcrypto/sha/asm/sha512-s390x.pl | 63 +- src/lib/libcrypto/sha/asm/sha512-sparcv9.pl | 6 +- src/lib/libcrypto/sha/asm/sha512-x86_64.pl | 86 +- src/lib/libcrypto/sha/sha.h | 14 + src/lib/libcrypto/sha/sha1dgst.c | 1 + src/lib/libcrypto/sha/sha256.c | 4 +- src/lib/libcrypto/sha/sha512.c | 54 +- src/lib/libcrypto/sha/sha_locl.h | 6 +- src/lib/libcrypto/sparcv9cap.c | 4 +- src/lib/libcrypto/stack/safestack.h | 138 +- src/lib/libcrypto/ts/ts.h | 3 - src/lib/libcrypto/ts/ts_rsp_verify.c | 9 +- src/lib/libcrypto/ui/ui.h | 2 +- src/lib/libcrypto/ui/ui_openssl.c | 2 +- src/lib/libcrypto/whrlpool/whrlpool.h | 3 + src/lib/libcrypto/whrlpool/wp_block.c | 4 +- src/lib/libcrypto/whrlpool/wp_dgst.c | 3 +- src/lib/libcrypto/x509/x509.h | 11 + src/lib/libcrypto/x509/x509_cmp.c | 27 +- src/lib/libcrypto/x509/x509_lu.c | 2 +- src/lib/libcrypto/x509/x509_vfy.c | 5 - src/lib/libcrypto/x509/x509type.c | 32 +- src/lib/libcrypto/x509/x_all.c | 19 + src/lib/libcrypto/x509v3/v3_skey.c | 3 +- src/lib/libcrypto/x86_64cpuid.pl | 87 +- src/lib/libcrypto/x86cpuid.pl | 78 +- src/lib/libssl/bio_ssl.c | 2 + src/lib/libssl/d1_both.c | 178 +- src/lib/libssl/d1_clnt.c | 194 +- src/lib/libssl/d1_enc.c | 2 +- src/lib/libssl/d1_lib.c | 54 +- src/lib/libssl/d1_pkt.c | 167 +- src/lib/libssl/d1_srtp.c | 493 ++++ src/lib/libssl/d1_srvr.c | 186 +- src/lib/libssl/dtls1.h | 18 +- src/lib/libssl/s23_clnt.c | 111 +- src/lib/libssl/s23_srvr.c | 52 +- src/lib/libssl/s3_both.c | 36 +- src/lib/libssl/s3_clnt.c | 397 ++- src/lib/libssl/s3_lib.c | 1222 ++++++++-- src/lib/libssl/s3_pkt.c | 77 +- src/lib/libssl/s3_srvr.c | 546 ++++- src/lib/libssl/srtp.h | 145 ++ src/lib/libssl/ssl.h | 313 ++- src/lib/libssl/ssl2.h | 4 + src/lib/libssl/ssl3.h | 32 +- src/lib/libssl/ssl_algs.c | 9 + src/lib/libssl/ssl_asn1.c | 50 + src/lib/libssl/ssl_cert.c | 21 +- src/lib/libssl/ssl_ciph.c | 133 +- src/lib/libssl/ssl_err.c | 36 + src/lib/libssl/ssl_lib.c | 247 +- src/lib/libssl/ssl_locl.h | 75 +- src/lib/libssl/ssl_sess.c | 160 +- src/lib/libssl/ssl_txt.c | 8 + src/lib/libssl/t1_clnt.c | 21 +- src/lib/libssl/t1_enc.c | 309 ++- src/lib/libssl/t1_lib.c | 941 +++++++- src/lib/libssl/t1_meth.c | 22 +- src/lib/libssl/t1_srvr.c | 21 +- src/lib/libssl/test/CAss.cnf | 2 +- src/lib/libssl/test/P1ss.cnf | 2 +- src/lib/libssl/test/P2ss.cnf | 2 +- src/lib/libssl/test/Uss.cnf | 4 +- src/lib/libssl/test/pkits-test.pl | 9 + src/lib/libssl/test/test.cnf | 2 +- src/lib/libssl/test/testssl | 10 + src/lib/libssl/tls1.h | 209 +- 359 files changed, 63248 insertions(+), 4583 deletions(-) create mode 100644 src/lib/libcrypto/aes/asm/aes-mips.pl create mode 100644 src/lib/libcrypto/aes/asm/aes-parisc.pl create mode 100644 src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl create mode 100644 src/lib/libcrypto/aes/asm/aesni-x86.pl create mode 100644 src/lib/libcrypto/aes/asm/bsaes-x86_64.pl create mode 100644 src/lib/libcrypto/aes/asm/vpaes-x86.pl create mode 100644 src/lib/libcrypto/aes/asm/vpaes-x86_64.pl create mode 100644 src/lib/libcrypto/arm_arch.h create mode 100644 src/lib/libcrypto/armcap.c create mode 100644 src/lib/libcrypto/armv4cpuid.S create mode 100644 src/lib/libcrypto/bn/asm/armv4-gf2m.pl create mode 100644 src/lib/libcrypto/bn/asm/ia64-mont.pl create mode 100644 src/lib/libcrypto/bn/asm/mips-mont.pl create mode 100644 src/lib/libcrypto/bn/asm/mips.pl create mode 100644 src/lib/libcrypto/bn/asm/modexp512-x86_64.pl create mode 100644 src/lib/libcrypto/bn/asm/parisc-mont.pl create mode 100644 src/lib/libcrypto/bn/asm/s390x-gf2m.pl create mode 100644 src/lib/libcrypto/bn/asm/x86-gf2m.pl create mode 100644 src/lib/libcrypto/bn/asm/x86_64-gf2m.pl create mode 100755 src/lib/libcrypto/bn/asm/x86_64-mont5.pl create mode 100644 src/lib/libcrypto/cmac/cm_ameth.c create mode 100644 src/lib/libcrypto/cmac/cm_pmeth.c create mode 100644 src/lib/libcrypto/cmac/cmac.c create mode 100644 src/lib/libcrypto/cmac/cmac.h create mode 100644 src/lib/libcrypto/cms/cms_pwri.c create mode 100644 src/lib/libcrypto/ec/ec2_oct.c create mode 100644 src/lib/libcrypto/ec/ec_oct.c create mode 100644 src/lib/libcrypto/ec/ecp_nistp224.c create mode 100644 src/lib/libcrypto/ec/ecp_nistp256.c create mode 100644 src/lib/libcrypto/ec/ecp_nistp521.c create mode 100644 src/lib/libcrypto/ec/ecp_nistputil.c create mode 100644 src/lib/libcrypto/ec/ecp_oct.c create mode 100644 src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c create mode 100644 src/lib/libcrypto/evp/e_rc4_hmac_md5.c create mode 100644 src/lib/libcrypto/idea/i_cbc.c create mode 100644 src/lib/libcrypto/idea/i_cfb64.c create mode 100644 src/lib/libcrypto/idea/i_ecb.c create mode 100644 src/lib/libcrypto/idea/i_ofb64.c create mode 100644 src/lib/libcrypto/idea/i_skey.c create mode 100644 src/lib/libcrypto/idea/idea_lcl.h create mode 100644 src/lib/libcrypto/modes/asm/ghash-alpha.pl create mode 100644 src/lib/libcrypto/modes/asm/ghash-armv4.pl create mode 100755 src/lib/libcrypto/modes/asm/ghash-ia64.pl create mode 100644 src/lib/libcrypto/modes/asm/ghash-parisc.pl create mode 100644 src/lib/libcrypto/modes/asm/ghash-s390x.pl create mode 100644 src/lib/libcrypto/modes/asm/ghash-sparcv9.pl create mode 100644 src/lib/libcrypto/modes/asm/ghash-x86.pl create mode 100644 src/lib/libcrypto/modes/asm/ghash-x86_64.pl create mode 100644 src/lib/libcrypto/modes/ccm128.c create mode 100644 src/lib/libcrypto/modes/gcm128.c create mode 100644 src/lib/libcrypto/modes/modes_lcl.h create mode 100644 src/lib/libcrypto/modes/xts128.c create mode 100644 src/lib/libcrypto/pariscid.pl create mode 100644 src/lib/libcrypto/ppccap.c create mode 100644 src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl create mode 100644 src/lib/libcrypto/rc4/asm/rc4-parisc.pl create mode 100644 src/lib/libcrypto/rsa/rsa_crpt.c create mode 100644 src/lib/libcrypto/sha/asm/sha1-alpha.pl create mode 100644 src/lib/libcrypto/sha/asm/sha1-mips.pl create mode 100644 src/lib/libcrypto/sha/asm/sha1-parisc.pl create mode 100644 src/lib/libcrypto/sha/asm/sha512-mips.pl create mode 100755 src/lib/libcrypto/sha/asm/sha512-parisc.pl create mode 100644 src/lib/libssl/d1_srtp.c create mode 100644 src/lib/libssl/srtp.h (limited to 'src/lib') diff --git a/src/lib/libcrypto/aes/aes.h b/src/lib/libcrypto/aes/aes.h index d2c99730fe..031abf01b5 100644 --- a/src/lib/libcrypto/aes/aes.h +++ b/src/lib/libcrypto/aes/aes.h @@ -90,6 +90,11 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits, int AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key); +int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key); +int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key); + void AES_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key); void AES_decrypt(const unsigned char *in, unsigned char *out, diff --git a/src/lib/libcrypto/aes/aes_core.c b/src/lib/libcrypto/aes/aes_core.c index a7ec54f4da..8f5210ac70 100644 --- a/src/lib/libcrypto/aes/aes_core.c +++ b/src/lib/libcrypto/aes/aes_core.c @@ -625,7 +625,7 @@ static const u32 rcon[] = { /** * Expand the cipher key into the encryption key schedule. */ -int AES_set_encrypt_key(const unsigned char *userKey, const int bits, +int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key) { u32 *rk; @@ -726,7 +726,7 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits, /** * Expand the cipher key into the decryption key schedule. */ -int AES_set_decrypt_key(const unsigned char *userKey, const int bits, +int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key) { u32 *rk; @@ -734,7 +734,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits, u32 temp; /* first, start with an encryption schedule */ - status = AES_set_encrypt_key(userKey, bits, key); + status = private_AES_set_encrypt_key(userKey, bits, key); if (status < 0) return status; @@ -1201,7 +1201,7 @@ static const u32 rcon[] = { /** * Expand the cipher key into the encryption key schedule. */ -int AES_set_encrypt_key(const unsigned char *userKey, const int bits, +int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key) { u32 *rk; int i = 0; @@ -1301,7 +1301,7 @@ int AES_set_encrypt_key(const unsigned char *userKey, const int bits, /** * Expand the cipher key into the decryption key schedule. */ -int AES_set_decrypt_key(const unsigned char *userKey, const int bits, +int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, AES_KEY *key) { u32 *rk; @@ -1309,7 +1309,7 @@ int AES_set_decrypt_key(const unsigned char *userKey, const int bits, u32 temp; /* first, start with an encryption schedule */ - status = AES_set_encrypt_key(userKey, bits, key); + status = private_AES_set_encrypt_key(userKey, bits, key); if (status < 0) return status; diff --git a/src/lib/libcrypto/aes/aes_misc.c b/src/lib/libcrypto/aes/aes_misc.c index 4fead1b4c7..f083488ecb 100644 --- a/src/lib/libcrypto/aes/aes_misc.c +++ b/src/lib/libcrypto/aes/aes_misc.c @@ -50,6 +50,7 @@ */ #include +#include #include #include "aes_locl.h" @@ -62,3 +63,23 @@ const char *AES_options(void) { return "aes(partial)"; #endif } + +/* FIPS wrapper functions to block low level AES calls in FIPS mode */ + +int AES_set_encrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key) + { +#ifdef OPENSSL_FIPS + fips_cipher_abort(AES); +#endif + return private_AES_set_encrypt_key(userKey, bits, key); + } + +int AES_set_decrypt_key(const unsigned char *userKey, const int bits, + AES_KEY *key) + { +#ifdef OPENSSL_FIPS + fips_cipher_abort(AES); +#endif + return private_AES_set_decrypt_key(userKey, bits, key); + } diff --git a/src/lib/libcrypto/aes/asm/aes-586.pl b/src/lib/libcrypto/aes/asm/aes-586.pl index aab40e6f1c..687ed811be 100644 --- a/src/lib/libcrypto/aes/asm/aes-586.pl +++ b/src/lib/libcrypto/aes/asm/aes-586.pl @@ -39,7 +39,7 @@ # but exhibits up to 10% improvement on other cores. # # Second version is "monolithic" replacement for aes_core.c, which in -# addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key. +# addition to AES_[de|en]crypt implements private_AES_set_[de|en]cryption_key. # This made it possible to implement little-endian variant of the # algorithm without modifying the base C code. Motivating factor for # the undertaken effort was that it appeared that in tight IA-32 @@ -2854,12 +2854,12 @@ sub enckey() &set_label("exit"); &function_end("_x86_AES_set_encrypt_key"); -# int AES_set_encrypt_key(const unsigned char *userKey, const int bits, +# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, # AES_KEY *key) -&function_begin_B("AES_set_encrypt_key"); +&function_begin_B("private_AES_set_encrypt_key"); &call ("_x86_AES_set_encrypt_key"); &ret (); -&function_end_B("AES_set_encrypt_key"); +&function_end_B("private_AES_set_encrypt_key"); sub deckey() { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; @@ -2916,9 +2916,9 @@ sub deckey() &mov (&DWP(4*$i,$key),$tp1); } -# int AES_set_decrypt_key(const unsigned char *userKey, const int bits, +# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, # AES_KEY *key) -&function_begin_B("AES_set_decrypt_key"); +&function_begin_B("private_AES_set_decrypt_key"); &call ("_x86_AES_set_encrypt_key"); &cmp ("eax",0); &je (&label("proceed")); @@ -2974,7 +2974,7 @@ sub deckey() &jb (&label("permute")); &xor ("eax","eax"); # return success -&function_end("AES_set_decrypt_key"); +&function_end("private_AES_set_decrypt_key"); &asciz("AES for x86, CRYPTOGAMS by "); &asm_finish(); diff --git a/src/lib/libcrypto/aes/asm/aes-armv4.pl b/src/lib/libcrypto/aes/asm/aes-armv4.pl index c51ee1fbf6..86b86c4a0f 100644 --- a/src/lib/libcrypto/aes/asm/aes-armv4.pl +++ b/src/lib/libcrypto/aes/asm/aes-armv4.pl @@ -27,6 +27,11 @@ # Rescheduling for dual-issue pipeline resulted in 12% improvement on # Cortex A8 core and ~25 cycles per byte processed with 128-bit key. +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 16% +# improvement on Cortex A8 core and ~21.5 cycles per byte. + while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; @@ -46,6 +51,7 @@ $key="r11"; $rounds="r12"; $code=<<___; +#include "arm_arch.h" .text .code 32 @@ -166,7 +172,7 @@ AES_encrypt: mov $rounds,r0 @ inp mov $key,r2 sub $tbl,r3,#AES_encrypt-AES_Te @ Te - +#if __ARM_ARCH__<7 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $t1,[$rounds,#2] @ manner... ldrb $t2,[$rounds,#1] @@ -195,10 +201,33 @@ AES_encrypt: orr $s3,$s3,$t1,lsl#8 orr $s3,$s3,$t2,lsl#16 orr $s3,$s3,$t3,lsl#24 - +#else + ldr $s0,[$rounds,#0] + ldr $s1,[$rounds,#4] + ldr $s2,[$rounds,#8] + ldr $s3,[$rounds,#12] +#ifdef __ARMEL__ + rev $s0,$s0 + rev $s1,$s1 + rev $s2,$s2 + rev $s3,$s3 +#endif +#endif bl _armv4_AES_encrypt ldr $rounds,[sp],#4 @ pop out +#if __ARM_ARCH__>=7 +#ifdef __ARMEL__ + rev $s0,$s0 + rev $s1,$s1 + rev $s2,$s2 + rev $s3,$s3 +#endif + str $s0,[$rounds,#0] + str $s1,[$rounds,#4] + str $s2,[$rounds,#8] + str $s3,[$rounds,#12] +#else mov $t1,$s0,lsr#24 @ write output in endian-neutral mov $t2,$s0,lsr#16 @ manner... mov $t3,$s0,lsr#8 @@ -227,11 +256,15 @@ AES_encrypt: strb $t2,[$rounds,#13] strb $t3,[$rounds,#14] strb $s3,[$rounds,#15] - +#endif +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else ldmia sp!,{r4-r12,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) +#endif .size AES_encrypt,.-AES_encrypt .type _armv4_AES_encrypt,%function @@ -271,11 +304,11 @@ _armv4_AES_encrypt: and $i2,lr,$s2,lsr#16 @ i1 eor $t3,$t3,$i3,ror#8 and $i3,lr,$s2 - eor $s1,$s1,$t1,ror#24 ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8] + eor $s1,$s1,$t1,ror#24 + ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16] mov $s2,$s2,lsr#24 - ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16] ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0] eor $s0,$s0,$i1,ror#16 ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24] @@ -284,16 +317,16 @@ _armv4_AES_encrypt: and $i2,lr,$s3,lsr#8 @ i1 eor $t3,$t3,$i3,ror#16 and $i3,lr,$s3,lsr#16 @ i2 - eor $s2,$s2,$t2,ror#16 ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0] + eor $s2,$s2,$t2,ror#16 + ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8] mov $s3,$s3,lsr#24 - ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8] ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16] eor $s0,$s0,$i1,ror#24 - ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24] - eor $s1,$s1,$i2,ror#16 ldr $i1,[$key],#16 + eor $s1,$s1,$i2,ror#16 + ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24] eor $s2,$s2,$i3,ror#8 ldr $t1,[$key,#-12] eor $s3,$s3,$t3,ror#8 @@ -333,11 +366,11 @@ _armv4_AES_encrypt: and $i2,lr,$s2,lsr#16 @ i1 eor $t3,$i3,$t3,lsl#8 and $i3,lr,$s2 - eor $s1,$t1,$s1,lsl#24 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8] + eor $s1,$t1,$s1,lsl#24 + ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16] mov $s2,$s2,lsr#24 - ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16] ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0] eor $s0,$i1,$s0,lsl#8 ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24] @@ -346,15 +379,15 @@ _armv4_AES_encrypt: and $i2,lr,$s3,lsr#8 @ i1 eor $t3,$i3,$t3,lsl#8 and $i3,lr,$s3,lsr#16 @ i2 - eor $s2,$t2,$s2,lsl#24 ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0] + eor $s2,$t2,$s2,lsl#24 + ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8] mov $s3,$s3,lsr#24 - ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8] ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16] eor $s0,$i1,$s0,lsl#8 - ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24] ldr $i1,[$key,#0] + ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24] eor $s1,$s1,$i2,lsl#8 ldr $t1,[$key,#4] eor $s2,$s2,$i3,lsl#16 @@ -371,10 +404,11 @@ _armv4_AES_encrypt: ldr pc,[sp],#4 @ pop and return .size _armv4_AES_encrypt,.-_armv4_AES_encrypt -.global AES_set_encrypt_key -.type AES_set_encrypt_key,%function +.global private_AES_set_encrypt_key +.type private_AES_set_encrypt_key,%function .align 5 -AES_set_encrypt_key: +private_AES_set_encrypt_key: +_armv4_AES_set_encrypt_key: sub r3,pc,#8 @ AES_set_encrypt_key teq r0,#0 moveq r0,#-1 @@ -392,12 +426,13 @@ AES_set_encrypt_key: bne .Labrt .Lok: stmdb sp!,{r4-r12,lr} - sub $tbl,r3,#AES_set_encrypt_key-AES_Te-1024 @ Te4 + sub $tbl,r3,#_armv4_AES_set_encrypt_key-AES_Te-1024 @ Te4 mov $rounds,r0 @ inp mov lr,r1 @ bits mov $key,r2 @ key +#if __ARM_ARCH__<7 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $t1,[$rounds,#2] @ manner... ldrb $t2,[$rounds,#1] @@ -430,6 +465,22 @@ AES_set_encrypt_key: orr $s3,$s3,$t3,lsl#24 str $s2,[$key,#-8] str $s3,[$key,#-4] +#else + ldr $s0,[$rounds,#0] + ldr $s1,[$rounds,#4] + ldr $s2,[$rounds,#8] + ldr $s3,[$rounds,#12] +#ifdef __ARMEL__ + rev $s0,$s0 + rev $s1,$s1 + rev $s2,$s2 + rev $s3,$s3 +#endif + str $s0,[$key],#16 + str $s1,[$key,#-12] + str $s2,[$key,#-8] + str $s3,[$key,#-4] +#endif teq lr,#128 bne .Lnot128 @@ -466,6 +517,7 @@ AES_set_encrypt_key: b .Ldone .Lnot128: +#if __ARM_ARCH__<7 ldrb $i2,[$rounds,#19] ldrb $t1,[$rounds,#18] ldrb $t2,[$rounds,#17] @@ -482,6 +534,16 @@ AES_set_encrypt_key: str $i2,[$key],#8 orr $i3,$i3,$t3,lsl#24 str $i3,[$key,#-4] +#else + ldr $i2,[$rounds,#16] + ldr $i3,[$rounds,#20] +#ifdef __ARMEL__ + rev $i2,$i2 + rev $i3,$i3 +#endif + str $i2,[$key],#8 + str $i3,[$key,#-4] +#endif teq lr,#192 bne .Lnot192 @@ -526,6 +588,7 @@ AES_set_encrypt_key: b .L192_loop .Lnot192: +#if __ARM_ARCH__<7 ldrb $i2,[$rounds,#27] ldrb $t1,[$rounds,#26] ldrb $t2,[$rounds,#25] @@ -542,6 +605,16 @@ AES_set_encrypt_key: str $i2,[$key],#8 orr $i3,$i3,$t3,lsl#24 str $i3,[$key,#-4] +#else + ldr $i2,[$rounds,#24] + ldr $i3,[$rounds,#28] +#ifdef __ARMEL__ + rev $i2,$i2 + rev $i3,$i3 +#endif + str $i2,[$key],#8 + str $i3,[$key,#-4] +#endif mov $rounds,#14 str $rounds,[$key,#240-32] @@ -606,14 +679,14 @@ AES_set_encrypt_key: .Labrt: tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) -.size AES_set_encrypt_key,.-AES_set_encrypt_key +.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key -.global AES_set_decrypt_key -.type AES_set_decrypt_key,%function +.global private_AES_set_decrypt_key +.type private_AES_set_decrypt_key,%function .align 5 -AES_set_decrypt_key: +private_AES_set_decrypt_key: str lr,[sp,#-4]! @ push lr - bl AES_set_encrypt_key + bl _armv4_AES_set_encrypt_key teq r0,#0 ldrne lr,[sp],#4 @ pop lr bne .Labrt @@ -692,11 +765,15 @@ $code.=<<___; bne .Lmix mov r0,#0 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else ldmia sp!,{r4-r12,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) -.size AES_set_decrypt_key,.-AES_set_decrypt_key +#endif +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key .type AES_Td,%object .align 5 @@ -811,7 +888,7 @@ AES_decrypt: mov $rounds,r0 @ inp mov $key,r2 sub $tbl,r3,#AES_decrypt-AES_Td @ Td - +#if __ARM_ARCH__<7 ldrb $s0,[$rounds,#3] @ load input data in endian-neutral ldrb $t1,[$rounds,#2] @ manner... ldrb $t2,[$rounds,#1] @@ -840,10 +917,33 @@ AES_decrypt: orr $s3,$s3,$t1,lsl#8 orr $s3,$s3,$t2,lsl#16 orr $s3,$s3,$t3,lsl#24 - +#else + ldr $s0,[$rounds,#0] + ldr $s1,[$rounds,#4] + ldr $s2,[$rounds,#8] + ldr $s3,[$rounds,#12] +#ifdef __ARMEL__ + rev $s0,$s0 + rev $s1,$s1 + rev $s2,$s2 + rev $s3,$s3 +#endif +#endif bl _armv4_AES_decrypt ldr $rounds,[sp],#4 @ pop out +#if __ARM_ARCH__>=7 +#ifdef __ARMEL__ + rev $s0,$s0 + rev $s1,$s1 + rev $s2,$s2 + rev $s3,$s3 +#endif + str $s0,[$rounds,#0] + str $s1,[$rounds,#4] + str $s2,[$rounds,#8] + str $s3,[$rounds,#12] +#else mov $t1,$s0,lsr#24 @ write output in endian-neutral mov $t2,$s0,lsr#16 @ manner... mov $t3,$s0,lsr#8 @@ -872,11 +972,15 @@ AES_decrypt: strb $t2,[$rounds,#13] strb $t3,[$rounds,#14] strb $s3,[$rounds,#15] - +#endif +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else ldmia sp!,{r4-r12,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) +#endif .size AES_decrypt,.-AES_decrypt .type _armv4_AES_decrypt,%function @@ -916,11 +1020,11 @@ _armv4_AES_decrypt: and $i2,lr,$s2 @ i1 eor $t3,$i3,$t3,ror#8 and $i3,lr,$s2,lsr#16 - eor $s1,$s1,$t1,ror#8 ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8] + eor $s1,$s1,$t1,ror#8 + ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0] mov $s2,$s2,lsr#24 - ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0] ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16] eor $s0,$s0,$i1,ror#16 ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24] @@ -929,22 +1033,22 @@ _armv4_AES_decrypt: and $i2,lr,$s3,lsr#8 @ i1 eor $t3,$i3,$t3,ror#8 and $i3,lr,$s3 @ i2 - eor $s2,$s2,$t2,ror#8 ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16] + eor $s2,$s2,$t2,ror#8 + ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8] mov $s3,$s3,lsr#24 - ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8] ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0] eor $s0,$s0,$i1,ror#8 - ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24] + ldr $i1,[$key],#16 eor $s1,$s1,$i2,ror#16 + ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24] eor $s2,$s2,$i3,ror#24 - ldr $i1,[$key],#16 - eor $s3,$s3,$t3,ror#8 ldr $t1,[$key,#-12] - ldr $t2,[$key,#-8] eor $s0,$s0,$i1 + ldr $t2,[$key,#-8] + eor $s3,$s3,$t3,ror#8 ldr $t3,[$key,#-4] and $i1,lr,$s0,lsr#16 eor $s1,$s1,$t1 @@ -985,11 +1089,11 @@ _armv4_AES_decrypt: and $i1,lr,$s2,lsr#8 @ i0 eor $t2,$t2,$i2,lsl#8 and $i2,lr,$s2 @ i1 - eor $t3,$t3,$i3,lsl#8 ldrb $i1,[$tbl,$i1] @ Td4[s2>>8] + eor $t3,$t3,$i3,lsl#8 + ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] and $i3,lr,$s2,lsr#16 - ldrb $i2,[$tbl,$i2] @ Td4[s2>>0] ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24] eor $s0,$s0,$i1,lsl#8 ldrb $i3,[$tbl,$i3] @ Td4[s2>>16] @@ -997,11 +1101,11 @@ _armv4_AES_decrypt: and $i1,lr,$s3,lsr#16 @ i0 eor $s2,$t2,$s2,lsl#16 and $i2,lr,$s3,lsr#8 @ i1 - eor $t3,$t3,$i3,lsl#16 ldrb $i1,[$tbl,$i1] @ Td4[s3>>16] + eor $t3,$t3,$i3,lsl#16 + ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] and $i3,lr,$s3 @ i2 - ldrb $i2,[$tbl,$i2] @ Td4[s3>>8] ldrb $i3,[$tbl,$i3] @ Td4[s3>>0] ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24] eor $s0,$s0,$i1,lsl#16 diff --git a/src/lib/libcrypto/aes/asm/aes-mips.pl b/src/lib/libcrypto/aes/asm/aes-mips.pl new file mode 100644 index 0000000000..2ce6deffc8 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aes-mips.pl @@ -0,0 +1,1611 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# AES for MIPS + +# October 2010 +# +# Code uses 1K[+256B] S-box and on single-issue core [such as R5000] +# spends ~68 cycles per byte processed with 128-bit key. This is ~16% +# faster than gcc-generated code, which is not very impressive. But +# recall that compressed S-box requires extra processing, namely +# additional rotations. Rotations are implemented with lwl/lwr pairs, +# which is normally used for loading unaligned data. Another cool +# thing about this module is its endian neutrality, which means that +# it processes data without ever changing byte order... + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 + +if ($flavour =~ /64|n32/i) { + $PTR_ADD="dadd"; # incidentally works even on n32 + $PTR_SUB="dsub"; # incidentally works even on n32 + $REG_S="sd"; + $REG_L="ld"; + $PTR_SLL="dsll"; # incidentally works even on n32 + $SZREG=8; +} else { + $PTR_ADD="add"; + $PTR_SUB="sub"; + $REG_S="sw"; + $REG_L="lw"; + $PTR_SLL="sll"; + $SZREG=4; +} +$pf = ($flavour =~ /nubi/i) ? $t0 : $t2; +# +# +# +###################################################################### + +$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; + +for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } +open STDOUT,">$output"; + +if (!defined($big_endian)) +{ $big_endian=(unpack('L',pack('N',1))==1); } + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +my ($MSB,$LSB)=(0,3); # automatically converted to little-endian + +$code.=<<___; +.text +#ifdef OPENSSL_FIPSCANISTER +# include +#endif + +#if !defined(__vxworks) || defined(__pic__) +.option pic2 +#endif +.set noat +___ + +{{{ +my $FRAMESIZE=16*$SZREG; +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; + +my ($inp,$out,$key,$Tbl,$s0,$s1,$s2,$s3)=($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7); +my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2); +my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11) = map("\$$_",(12..23)); +my ($key0,$cnt)=($gp,$fp); + +# instuction ordering is "stolen" from output from MIPSpro assembler +# invoked with -mips3 -O3 arguments... +$code.=<<___; +.align 5 +.ent _mips_AES_encrypt +_mips_AES_encrypt: + .frame $sp,0,$ra + .set reorder + lw $t0,0($key) + lw $t1,4($key) + lw $t2,8($key) + lw $t3,12($key) + lw $cnt,240($key) + $PTR_ADD $key0,$key,16 + + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + + sub $cnt,1 + _xtr $i0,$s1,16-2 +.Loop_enc: + _xtr $i1,$s2,16-2 + _xtr $i2,$s3,16-2 + _xtr $i3,$s0,16-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t0,3($i0) # Te1[s1>>16] + lwl $t1,3($i1) # Te1[s2>>16] + lwl $t2,3($i2) # Te1[s3>>16] + lwl $t3,3($i3) # Te1[s0>>16] + lwr $t0,2($i0) # Te1[s1>>16] + lwr $t1,2($i1) # Te1[s2>>16] + lwr $t2,2($i2) # Te1[s3>>16] + lwr $t3,2($i3) # Te1[s0>>16] + + _xtr $i0,$s2,8-2 + _xtr $i1,$s3,8-2 + _xtr $i2,$s0,8-2 + _xtr $i3,$s1,8-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t4,2($i0) # Te2[s2>>8] + lwl $t5,2($i1) # Te2[s3>>8] + lwl $t6,2($i2) # Te2[s0>>8] + lwl $t7,2($i3) # Te2[s1>>8] + lwr $t4,1($i0) # Te2[s2>>8] + lwr $t5,1($i1) # Te2[s3>>8] + lwr $t6,1($i2) # Te2[s0>>8] + lwr $t7,1($i3) # Te2[s1>>8] + + _xtr $i0,$s3,0-2 + _xtr $i1,$s0,0-2 + _xtr $i2,$s1,0-2 + _xtr $i3,$s2,0-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t8,1($i0) # Te3[s3] + lwl $t9,1($i1) # Te3[s0] + lwl $t10,1($i2) # Te3[s1] + lwl $t11,1($i3) # Te3[s2] + lwr $t8,0($i0) # Te3[s3] + lwr $t9,0($i1) # Te3[s0] + lwr $t10,0($i2) # Te3[s1] + lwr $t11,0($i3) # Te3[s2] + + _xtr $i0,$s0,24-2 + _xtr $i1,$s1,24-2 + _xtr $i2,$s2,24-2 + _xtr $i3,$s3,24-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + lw $t4,0($i0) # Te0[s0>>24] + lw $t5,0($i1) # Te0[s1>>24] + lw $t6,0($i2) # Te0[s2>>24] + lw $t7,0($i3) # Te0[s3>>24] + + lw $s0,0($key0) + lw $s1,4($key0) + lw $s2,8($key0) + lw $s3,12($key0) + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + sub $cnt,1 + $PTR_ADD $key0,16 + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + .set noreorder + bnez $cnt,.Loop_enc + _xtr $i0,$s1,16-2 + + .set reorder + _xtr $i1,$s2,16-2 + _xtr $i2,$s3,16-2 + _xtr $i3,$s0,16-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t0,2($i0) # Te4[s1>>16] + lbu $t1,2($i1) # Te4[s2>>16] + lbu $t2,2($i2) # Te4[s3>>16] + lbu $t3,2($i3) # Te4[s0>>16] + + _xtr $i0,$s2,8-2 + _xtr $i1,$s3,8-2 + _xtr $i2,$s0,8-2 + _xtr $i3,$s1,8-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t4,2($i0) # Te4[s2>>8] + lbu $t5,2($i1) # Te4[s3>>8] + lbu $t6,2($i2) # Te4[s0>>8] + lbu $t7,2($i3) # Te4[s1>>8] + + _xtr $i0,$s0,24-2 + _xtr $i1,$s1,24-2 + _xtr $i2,$s2,24-2 + _xtr $i3,$s3,24-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t8,2($i0) # Te4[s0>>24] + lbu $t9,2($i1) # Te4[s1>>24] + lbu $t10,2($i2) # Te4[s2>>24] + lbu $t11,2($i3) # Te4[s3>>24] + + _xtr $i0,$s3,0-2 + _xtr $i1,$s0,0-2 + _xtr $i2,$s1,0-2 + _xtr $i3,$s2,0-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + + _ins $t0,16 + _ins $t1,16 + _ins $t2,16 + _ins $t3,16 + + _ins $t4,8 + _ins $t5,8 + _ins $t6,8 + _ins $t7,8 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t4,2($i0) # Te4[s3] + lbu $t5,2($i1) # Te4[s0] + lbu $t6,2($i2) # Te4[s1] + lbu $t7,2($i3) # Te4[s2] + + _ins $t8,24 + _ins $t9,24 + _ins $t10,24 + _ins $t11,24 + + lw $s0,0($key0) + lw $s1,4($key0) + lw $s2,8($key0) + lw $s3,12($key0) + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + _ins $t4,0 + _ins $t5,0 + _ins $t6,0 + _ins $t7,0 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + + jr $ra +.end _mips_AES_encrypt + +.align 5 +.globl AES_encrypt +.ent AES_encrypt +AES_encrypt: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S \$15,$FRAMESIZE-11*$SZREG($sp) + $REG_S \$14,$FRAMESIZE-12*$SZREG($sp) + $REG_S \$13,$FRAMESIZE-13*$SZREG($sp) + $REG_S \$12,$FRAMESIZE-14*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Tbl + .cpsetup $pf,$zero,AES_encrypt +___ +$code.=<<___; + .set reorder + la $Tbl,AES_Te # PIC-ified 'load address' + + lwl $s0,0+$MSB($inp) + lwl $s1,4+$MSB($inp) + lwl $s2,8+$MSB($inp) + lwl $s3,12+$MSB($inp) + lwr $s0,0+$LSB($inp) + lwr $s1,4+$LSB($inp) + lwr $s2,8+$LSB($inp) + lwr $s3,12+$LSB($inp) + + bal _mips_AES_encrypt + + swr $s0,0+$LSB($out) + swr $s1,4+$LSB($out) + swr $s2,8+$LSB($out) + swr $s3,12+$LSB($out) + swl $s0,0+$MSB($out) + swl $s1,4+$MSB($out) + swl $s2,8+$MSB($out) + swl $s3,12+$MSB($out) + + .set noreorder + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L \$15,$FRAMESIZE-11*$SZREG($sp) + $REG_L \$14,$FRAMESIZE-12*$SZREG($sp) + $REG_L \$13,$FRAMESIZE-13*$SZREG($sp) + $REG_L \$12,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end AES_encrypt +___ + +$code.=<<___; +.align 5 +.ent _mips_AES_decrypt +_mips_AES_decrypt: + .frame $sp,0,$ra + .set reorder + lw $t0,0($key) + lw $t1,4($key) + lw $t2,8($key) + lw $t3,12($key) + lw $cnt,240($key) + $PTR_ADD $key0,$key,16 + + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + + sub $cnt,1 + _xtr $i0,$s3,16-2 +.Loop_dec: + _xtr $i1,$s0,16-2 + _xtr $i2,$s1,16-2 + _xtr $i3,$s2,16-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t0,3($i0) # Td1[s3>>16] + lwl $t1,3($i1) # Td1[s0>>16] + lwl $t2,3($i2) # Td1[s1>>16] + lwl $t3,3($i3) # Td1[s2>>16] + lwr $t0,2($i0) # Td1[s3>>16] + lwr $t1,2($i1) # Td1[s0>>16] + lwr $t2,2($i2) # Td1[s1>>16] + lwr $t3,2($i3) # Td1[s2>>16] + + _xtr $i0,$s2,8-2 + _xtr $i1,$s3,8-2 + _xtr $i2,$s0,8-2 + _xtr $i3,$s1,8-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t4,2($i0) # Td2[s2>>8] + lwl $t5,2($i1) # Td2[s3>>8] + lwl $t6,2($i2) # Td2[s0>>8] + lwl $t7,2($i3) # Td2[s1>>8] + lwr $t4,1($i0) # Td2[s2>>8] + lwr $t5,1($i1) # Td2[s3>>8] + lwr $t6,1($i2) # Td2[s0>>8] + lwr $t7,1($i3) # Td2[s1>>8] + + _xtr $i0,$s1,0-2 + _xtr $i1,$s2,0-2 + _xtr $i2,$s3,0-2 + _xtr $i3,$s0,0-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lwl $t8,1($i0) # Td3[s1] + lwl $t9,1($i1) # Td3[s2] + lwl $t10,1($i2) # Td3[s3] + lwl $t11,1($i3) # Td3[s0] + lwr $t8,0($i0) # Td3[s1] + lwr $t9,0($i1) # Td3[s2] + lwr $t10,0($i2) # Td3[s3] + lwr $t11,0($i3) # Td3[s0] + + _xtr $i0,$s0,24-2 + _xtr $i1,$s1,24-2 + _xtr $i2,$s2,24-2 + _xtr $i3,$s3,24-2 + and $i0,0x3fc + and $i1,0x3fc + and $i2,0x3fc + and $i3,0x3fc + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + + lw $t4,0($i0) # Td0[s0>>24] + lw $t5,0($i1) # Td0[s1>>24] + lw $t6,0($i2) # Td0[s2>>24] + lw $t7,0($i3) # Td0[s3>>24] + + lw $s0,0($key0) + lw $s1,4($key0) + lw $s2,8($key0) + lw $s3,12($key0) + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + sub $cnt,1 + $PTR_ADD $key0,16 + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + .set noreorder + bnez $cnt,.Loop_dec + _xtr $i0,$s3,16-2 + + .set reorder + lw $t4,1024($Tbl) # prefetch Td4 + lw $t5,1024+32($Tbl) + lw $t6,1024+64($Tbl) + lw $t7,1024+96($Tbl) + lw $t8,1024+128($Tbl) + lw $t9,1024+160($Tbl) + lw $t10,1024+192($Tbl) + lw $t11,1024+224($Tbl) + + _xtr $i0,$s3,16 + _xtr $i1,$s0,16 + _xtr $i2,$s1,16 + _xtr $i3,$s2,16 + and $i0,0xff + and $i1,0xff + and $i2,0xff + and $i3,0xff + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t0,1024($i0) # Td4[s3>>16] + lbu $t1,1024($i1) # Td4[s0>>16] + lbu $t2,1024($i2) # Td4[s1>>16] + lbu $t3,1024($i3) # Td4[s2>>16] + + _xtr $i0,$s2,8 + _xtr $i1,$s3,8 + _xtr $i2,$s0,8 + _xtr $i3,$s1,8 + and $i0,0xff + and $i1,0xff + and $i2,0xff + and $i3,0xff + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t4,1024($i0) # Td4[s2>>8] + lbu $t5,1024($i1) # Td4[s3>>8] + lbu $t6,1024($i2) # Td4[s0>>8] + lbu $t7,1024($i3) # Td4[s1>>8] + + _xtr $i0,$s0,24 + _xtr $i1,$s1,24 + _xtr $i2,$s2,24 + _xtr $i3,$s3,24 + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t8,1024($i0) # Td4[s0>>24] + lbu $t9,1024($i1) # Td4[s1>>24] + lbu $t10,1024($i2) # Td4[s2>>24] + lbu $t11,1024($i3) # Td4[s3>>24] + + _xtr $i0,$s1,0 + _xtr $i1,$s2,0 + _xtr $i2,$s3,0 + _xtr $i3,$s0,0 + + _ins $t0,16 + _ins $t1,16 + _ins $t2,16 + _ins $t3,16 + + _ins $t4,8 + _ins $t5,8 + _ins $t6,8 + _ins $t7,8 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $t4,1024($i0) # Td4[s1] + lbu $t5,1024($i1) # Td4[s2] + lbu $t6,1024($i2) # Td4[s3] + lbu $t7,1024($i3) # Td4[s0] + + _ins $t8,24 + _ins $t9,24 + _ins $t10,24 + _ins $t11,24 + + lw $s0,0($key0) + lw $s1,4($key0) + lw $s2,8($key0) + lw $s3,12($key0) + + _ins $t4,0 + _ins $t5,0 + _ins $t6,0 + _ins $t7,0 + + + xor $t0,$t8 + xor $t1,$t9 + xor $t2,$t10 + xor $t3,$t11 + + xor $t0,$t4 + xor $t1,$t5 + xor $t2,$t6 + xor $t3,$t7 + + xor $s0,$t0 + xor $s1,$t1 + xor $s2,$t2 + xor $s3,$t3 + + jr $ra +.end _mips_AES_decrypt + +.align 5 +.globl AES_decrypt +.ent AES_decrypt +AES_decrypt: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S \$15,$FRAMESIZE-11*$SZREG($sp) + $REG_S \$14,$FRAMESIZE-12*$SZREG($sp) + $REG_S \$13,$FRAMESIZE-13*$SZREG($sp) + $REG_S \$12,$FRAMESIZE-14*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Tbl + .cpsetup $pf,$zero,AES_decrypt +___ +$code.=<<___; + .set reorder + la $Tbl,AES_Td # PIC-ified 'load address' + + lwl $s0,0+$MSB($inp) + lwl $s1,4+$MSB($inp) + lwl $s2,8+$MSB($inp) + lwl $s3,12+$MSB($inp) + lwr $s0,0+$LSB($inp) + lwr $s1,4+$LSB($inp) + lwr $s2,8+$LSB($inp) + lwr $s3,12+$LSB($inp) + + bal _mips_AES_decrypt + + swr $s0,0+$LSB($out) + swr $s1,4+$LSB($out) + swr $s2,8+$LSB($out) + swr $s3,12+$LSB($out) + swl $s0,0+$MSB($out) + swl $s1,4+$MSB($out) + swl $s2,8+$MSB($out) + swl $s3,12+$MSB($out) + + .set noreorder + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L \$15,$FRAMESIZE-11*$SZREG($sp) + $REG_L \$14,$FRAMESIZE-12*$SZREG($sp) + $REG_L \$13,$FRAMESIZE-13*$SZREG($sp) + $REG_L \$12,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end AES_decrypt +___ +}}} + +{{{ +my $FRAMESIZE=8*$SZREG; +my $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc000f008 : 0xc0000000; + +my ($inp,$bits,$key,$Tbl)=($a0,$a1,$a2,$a3); +my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); +my ($i0,$i1,$i2,$i3)=($at,$t0,$t1,$t2); +my ($rcon,$cnt)=($gp,$fp); + +$code.=<<___; +.align 5 +.ent _mips_AES_set_encrypt_key +_mips_AES_set_encrypt_key: + .frame $sp,0,$ra + .set noreorder + beqz $inp,.Lekey_done + li $t0,-1 + beqz $key,.Lekey_done + $PTR_ADD $rcon,$Tbl,1024+256 + + .set reorder + lwl $rk0,0+$MSB($inp) # load 128 bits + lwl $rk1,4+$MSB($inp) + lwl $rk2,8+$MSB($inp) + lwl $rk3,12+$MSB($inp) + li $at,128 + lwr $rk0,0+$LSB($inp) + lwr $rk1,4+$LSB($inp) + lwr $rk2,8+$LSB($inp) + lwr $rk3,12+$LSB($inp) + .set noreorder + beq $bits,$at,.L128bits + li $cnt,10 + + .set reorder + lwl $rk4,16+$MSB($inp) # load 192 bits + lwl $rk5,20+$MSB($inp) + li $at,192 + lwr $rk4,16+$LSB($inp) + lwr $rk5,20+$LSB($inp) + .set noreorder + beq $bits,$at,.L192bits + li $cnt,8 + + .set reorder + lwl $rk6,24+$MSB($inp) # load 256 bits + lwl $rk7,28+$MSB($inp) + li $at,256 + lwr $rk6,24+$LSB($inp) + lwr $rk7,28+$LSB($inp) + .set noreorder + beq $bits,$at,.L256bits + li $cnt,7 + + b .Lekey_done + li $t0,-2 + +.align 4 +.L128bits: + .set reorder + srl $i0,$rk3,16 + srl $i1,$rk3,8 + and $i0,0xff + and $i1,0xff + and $i2,$rk3,0xff + srl $i3,$rk3,24 + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $i0,1024($i0) + lbu $i1,1024($i1) + lbu $i2,1024($i2) + lbu $i3,1024($i3) + + sw $rk0,0($key) + sw $rk1,4($key) + sw $rk2,8($key) + sw $rk3,12($key) + sub $cnt,1 + $PTR_ADD $key,16 + + _bias $i0,24 + _bias $i1,16 + _bias $i2,8 + _bias $i3,0 + + xor $rk0,$i0 + lw $i0,0($rcon) + xor $rk0,$i1 + xor $rk0,$i2 + xor $rk0,$i3 + xor $rk0,$i0 + + xor $rk1,$rk0 + xor $rk2,$rk1 + xor $rk3,$rk2 + + .set noreorder + bnez $cnt,.L128bits + $PTR_ADD $rcon,4 + + sw $rk0,0($key) + sw $rk1,4($key) + sw $rk2,8($key) + li $cnt,10 + sw $rk3,12($key) + li $t0,0 + sw $cnt,80($key) + b .Lekey_done + $PTR_SUB $key,10*16 + +.align 4 +.L192bits: + .set reorder + srl $i0,$rk5,16 + srl $i1,$rk5,8 + and $i0,0xff + and $i1,0xff + and $i2,$rk5,0xff + srl $i3,$rk5,24 + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $i0,1024($i0) + lbu $i1,1024($i1) + lbu $i2,1024($i2) + lbu $i3,1024($i3) + + sw $rk0,0($key) + sw $rk1,4($key) + sw $rk2,8($key) + sw $rk3,12($key) + sw $rk4,16($key) + sw $rk5,20($key) + sub $cnt,1 + $PTR_ADD $key,24 + + _bias $i0,24 + _bias $i1,16 + _bias $i2,8 + _bias $i3,0 + + xor $rk0,$i0 + lw $i0,0($rcon) + xor $rk0,$i1 + xor $rk0,$i2 + xor $rk0,$i3 + xor $rk0,$i0 + + xor $rk1,$rk0 + xor $rk2,$rk1 + xor $rk3,$rk2 + xor $rk4,$rk3 + xor $rk5,$rk4 + + .set noreorder + bnez $cnt,.L192bits + $PTR_ADD $rcon,4 + + sw $rk0,0($key) + sw $rk1,4($key) + sw $rk2,8($key) + li $cnt,12 + sw $rk3,12($key) + li $t0,0 + sw $cnt,48($key) + b .Lekey_done + $PTR_SUB $key,12*16 + +.align 4 +.L256bits: + .set reorder + srl $i0,$rk7,16 + srl $i1,$rk7,8 + and $i0,0xff + and $i1,0xff + and $i2,$rk7,0xff + srl $i3,$rk7,24 + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $i0,1024($i0) + lbu $i1,1024($i1) + lbu $i2,1024($i2) + lbu $i3,1024($i3) + + sw $rk0,0($key) + sw $rk1,4($key) + sw $rk2,8($key) + sw $rk3,12($key) + sw $rk4,16($key) + sw $rk5,20($key) + sw $rk6,24($key) + sw $rk7,28($key) + sub $cnt,1 + + _bias $i0,24 + _bias $i1,16 + _bias $i2,8 + _bias $i3,0 + + xor $rk0,$i0 + lw $i0,0($rcon) + xor $rk0,$i1 + xor $rk0,$i2 + xor $rk0,$i3 + xor $rk0,$i0 + + xor $rk1,$rk0 + xor $rk2,$rk1 + xor $rk3,$rk2 + beqz $cnt,.L256bits_done + + srl $i0,$rk3,24 + srl $i1,$rk3,16 + srl $i2,$rk3,8 + and $i3,$rk3,0xff + and $i1,0xff + and $i2,0xff + $PTR_ADD $i0,$Tbl + $PTR_ADD $i1,$Tbl + $PTR_ADD $i2,$Tbl + $PTR_ADD $i3,$Tbl + lbu $i0,1024($i0) + lbu $i1,1024($i1) + lbu $i2,1024($i2) + lbu $i3,1024($i3) + sll $i0,24 + sll $i1,16 + sll $i2,8 + + xor $rk4,$i0 + xor $rk4,$i1 + xor $rk4,$i2 + xor $rk4,$i3 + + xor $rk5,$rk4 + xor $rk6,$rk5 + xor $rk7,$rk6 + + $PTR_ADD $key,32 + .set noreorder + b .L256bits + $PTR_ADD $rcon,4 + +.L256bits_done: + sw $rk0,32($key) + sw $rk1,36($key) + sw $rk2,40($key) + li $cnt,14 + sw $rk3,44($key) + li $t0,0 + sw $cnt,48($key) + $PTR_SUB $key,12*16 + +.Lekey_done: + jr $ra + nop +.end _mips_AES_set_encrypt_key + +.globl AES_set_encrypt_key +.ent AES_set_encrypt_key +AES_set_encrypt_key: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S $s3,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s2,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s1,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s0,$FRAMESIZE-6*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-7*$SZREG($sp) +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Tbl + .cpsetup $pf,$zero,AES_set_encrypt_key +___ +$code.=<<___; + .set reorder + la $Tbl,AES_Te # PIC-ified 'load address' + + bal _mips_AES_set_encrypt_key + + .set noreorder + move $a0,$t0 + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) + $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) + $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) + $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end AES_set_encrypt_key +___ + +my ($head,$tail)=($inp,$bits); +my ($tp1,$tp2,$tp4,$tp8,$tp9,$tpb,$tpd,$tpe)=($a4,$a5,$a6,$a7,$s0,$s1,$s2,$s3); +my ($m,$x80808080,$x7f7f7f7f,$x1b1b1b1b)=($at,$t0,$t1,$t2); +$code.=<<___; +.align 5 +.globl AES_set_decrypt_key +.ent AES_set_decrypt_key +AES_set_decrypt_key: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S $s3,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s2,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s1,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s0,$FRAMESIZE-6*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-7*$SZREG($sp) +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Tbl + .cpsetup $pf,$zero,AES_set_decrypt_key +___ +$code.=<<___; + .set reorder + la $Tbl,AES_Te # PIC-ified 'load address' + + bal _mips_AES_set_encrypt_key + + bltz $t0,.Ldkey_done + + sll $at,$cnt,4 + $PTR_ADD $head,$key,0 + $PTR_ADD $tail,$key,$at +.align 4 +.Lswap: + lw $rk0,0($head) + lw $rk1,4($head) + lw $rk2,8($head) + lw $rk3,12($head) + lw $rk4,0($tail) + lw $rk5,4($tail) + lw $rk6,8($tail) + lw $rk7,12($tail) + sw $rk0,0($tail) + sw $rk1,4($tail) + sw $rk2,8($tail) + sw $rk3,12($tail) + $PTR_ADD $head,16 + $PTR_SUB $tail,16 + sw $rk4,-16($head) + sw $rk5,-12($head) + sw $rk6,-8($head) + sw $rk7,-4($head) + bne $head,$tail,.Lswap + + lw $tp1,16($key) # modulo-scheduled + lui $x80808080,0x8080 + sub $cnt,1 + or $x80808080,0x8080 + sll $cnt,2 + $PTR_ADD $key,16 + lui $x1b1b1b1b,0x1b1b + nor $x7f7f7f7f,$zero,$x80808080 + or $x1b1b1b1b,0x1b1b +.align 4 +.Lmix: + and $m,$tp1,$x80808080 + and $tp2,$tp1,$x7f7f7f7f + srl $tp4,$m,7 + addu $tp2,$tp2 # tp2<<1 + subu $m,$tp4 + and $m,$x1b1b1b1b + xor $tp2,$m + + and $m,$tp2,$x80808080 + and $tp4,$tp2,$x7f7f7f7f + srl $tp8,$m,7 + addu $tp4,$tp4 # tp4<<1 + subu $m,$tp8 + and $m,$x1b1b1b1b + xor $tp4,$m + + and $m,$tp4,$x80808080 + and $tp8,$tp4,$x7f7f7f7f + srl $tp9,$m,7 + addu $tp8,$tp8 # tp8<<1 + subu $m,$tp9 + and $m,$x1b1b1b1b + xor $tp8,$m + + xor $tp9,$tp8,$tp1 + xor $tpe,$tp8,$tp4 + xor $tpb,$tp9,$tp2 + xor $tpd,$tp9,$tp4 + + _ror $tp1,$tpd,16 + xor $tpe,$tp2 + _ror $tp2,$tpd,-16 + xor $tpe,$tp1 + _ror $tp1,$tp9,8 + xor $tpe,$tp2 + _ror $tp2,$tp9,-24 + xor $tpe,$tp1 + _ror $tp1,$tpb,24 + xor $tpe,$tp2 + _ror $tp2,$tpb,-8 + xor $tpe,$tp1 + lw $tp1,4($key) # modulo-scheduled + xor $tpe,$tp2 + sub $cnt,1 + sw $tpe,0($key) + $PTR_ADD $key,4 + bnez $cnt,.Lmix + + li $t0,0 +.Ldkey_done: + .set noreorder + move $a0,$t0 + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) + $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) + $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) + $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end AES_set_decrypt_key +___ +}}} + +###################################################################### +# Tables are kept in endian-neutral manner +$code.=<<___; +.rdata +.align 6 +AES_Te: +.byte 0xc6,0x63,0x63,0xa5, 0xf8,0x7c,0x7c,0x84 # Te0 +.byte 0xee,0x77,0x77,0x99, 0xf6,0x7b,0x7b,0x8d +.byte 0xff,0xf2,0xf2,0x0d, 0xd6,0x6b,0x6b,0xbd +.byte 0xde,0x6f,0x6f,0xb1, 0x91,0xc5,0xc5,0x54 +.byte 0x60,0x30,0x30,0x50, 0x02,0x01,0x01,0x03 +.byte 0xce,0x67,0x67,0xa9, 0x56,0x2b,0x2b,0x7d +.byte 0xe7,0xfe,0xfe,0x19, 0xb5,0xd7,0xd7,0x62 +.byte 0x4d,0xab,0xab,0xe6, 0xec,0x76,0x76,0x9a +.byte 0x8f,0xca,0xca,0x45, 0x1f,0x82,0x82,0x9d +.byte 0x89,0xc9,0xc9,0x40, 0xfa,0x7d,0x7d,0x87 +.byte 0xef,0xfa,0xfa,0x15, 0xb2,0x59,0x59,0xeb +.byte 0x8e,0x47,0x47,0xc9, 0xfb,0xf0,0xf0,0x0b +.byte 0x41,0xad,0xad,0xec, 0xb3,0xd4,0xd4,0x67 +.byte 0x5f,0xa2,0xa2,0xfd, 0x45,0xaf,0xaf,0xea +.byte 0x23,0x9c,0x9c,0xbf, 0x53,0xa4,0xa4,0xf7 +.byte 0xe4,0x72,0x72,0x96, 0x9b,0xc0,0xc0,0x5b +.byte 0x75,0xb7,0xb7,0xc2, 0xe1,0xfd,0xfd,0x1c +.byte 0x3d,0x93,0x93,0xae, 0x4c,0x26,0x26,0x6a +.byte 0x6c,0x36,0x36,0x5a, 0x7e,0x3f,0x3f,0x41 +.byte 0xf5,0xf7,0xf7,0x02, 0x83,0xcc,0xcc,0x4f +.byte 0x68,0x34,0x34,0x5c, 0x51,0xa5,0xa5,0xf4 +.byte 0xd1,0xe5,0xe5,0x34, 0xf9,0xf1,0xf1,0x08 +.byte 0xe2,0x71,0x71,0x93, 0xab,0xd8,0xd8,0x73 +.byte 0x62,0x31,0x31,0x53, 0x2a,0x15,0x15,0x3f +.byte 0x08,0x04,0x04,0x0c, 0x95,0xc7,0xc7,0x52 +.byte 0x46,0x23,0x23,0x65, 0x9d,0xc3,0xc3,0x5e +.byte 0x30,0x18,0x18,0x28, 0x37,0x96,0x96,0xa1 +.byte 0x0a,0x05,0x05,0x0f, 0x2f,0x9a,0x9a,0xb5 +.byte 0x0e,0x07,0x07,0x09, 0x24,0x12,0x12,0x36 +.byte 0x1b,0x80,0x80,0x9b, 0xdf,0xe2,0xe2,0x3d +.byte 0xcd,0xeb,0xeb,0x26, 0x4e,0x27,0x27,0x69 +.byte 0x7f,0xb2,0xb2,0xcd, 0xea,0x75,0x75,0x9f +.byte 0x12,0x09,0x09,0x1b, 0x1d,0x83,0x83,0x9e +.byte 0x58,0x2c,0x2c,0x74, 0x34,0x1a,0x1a,0x2e +.byte 0x36,0x1b,0x1b,0x2d, 0xdc,0x6e,0x6e,0xb2 +.byte 0xb4,0x5a,0x5a,0xee, 0x5b,0xa0,0xa0,0xfb +.byte 0xa4,0x52,0x52,0xf6, 0x76,0x3b,0x3b,0x4d +.byte 0xb7,0xd6,0xd6,0x61, 0x7d,0xb3,0xb3,0xce +.byte 0x52,0x29,0x29,0x7b, 0xdd,0xe3,0xe3,0x3e +.byte 0x5e,0x2f,0x2f,0x71, 0x13,0x84,0x84,0x97 +.byte 0xa6,0x53,0x53,0xf5, 0xb9,0xd1,0xd1,0x68 +.byte 0x00,0x00,0x00,0x00, 0xc1,0xed,0xed,0x2c +.byte 0x40,0x20,0x20,0x60, 0xe3,0xfc,0xfc,0x1f +.byte 0x79,0xb1,0xb1,0xc8, 0xb6,0x5b,0x5b,0xed +.byte 0xd4,0x6a,0x6a,0xbe, 0x8d,0xcb,0xcb,0x46 +.byte 0x67,0xbe,0xbe,0xd9, 0x72,0x39,0x39,0x4b +.byte 0x94,0x4a,0x4a,0xde, 0x98,0x4c,0x4c,0xd4 +.byte 0xb0,0x58,0x58,0xe8, 0x85,0xcf,0xcf,0x4a +.byte 0xbb,0xd0,0xd0,0x6b, 0xc5,0xef,0xef,0x2a +.byte 0x4f,0xaa,0xaa,0xe5, 0xed,0xfb,0xfb,0x16 +.byte 0x86,0x43,0x43,0xc5, 0x9a,0x4d,0x4d,0xd7 +.byte 0x66,0x33,0x33,0x55, 0x11,0x85,0x85,0x94 +.byte 0x8a,0x45,0x45,0xcf, 0xe9,0xf9,0xf9,0x10 +.byte 0x04,0x02,0x02,0x06, 0xfe,0x7f,0x7f,0x81 +.byte 0xa0,0x50,0x50,0xf0, 0x78,0x3c,0x3c,0x44 +.byte 0x25,0x9f,0x9f,0xba, 0x4b,0xa8,0xa8,0xe3 +.byte 0xa2,0x51,0x51,0xf3, 0x5d,0xa3,0xa3,0xfe +.byte 0x80,0x40,0x40,0xc0, 0x05,0x8f,0x8f,0x8a +.byte 0x3f,0x92,0x92,0xad, 0x21,0x9d,0x9d,0xbc +.byte 0x70,0x38,0x38,0x48, 0xf1,0xf5,0xf5,0x04 +.byte 0x63,0xbc,0xbc,0xdf, 0x77,0xb6,0xb6,0xc1 +.byte 0xaf,0xda,0xda,0x75, 0x42,0x21,0x21,0x63 +.byte 0x20,0x10,0x10,0x30, 0xe5,0xff,0xff,0x1a +.byte 0xfd,0xf3,0xf3,0x0e, 0xbf,0xd2,0xd2,0x6d +.byte 0x81,0xcd,0xcd,0x4c, 0x18,0x0c,0x0c,0x14 +.byte 0x26,0x13,0x13,0x35, 0xc3,0xec,0xec,0x2f +.byte 0xbe,0x5f,0x5f,0xe1, 0x35,0x97,0x97,0xa2 +.byte 0x88,0x44,0x44,0xcc, 0x2e,0x17,0x17,0x39 +.byte 0x93,0xc4,0xc4,0x57, 0x55,0xa7,0xa7,0xf2 +.byte 0xfc,0x7e,0x7e,0x82, 0x7a,0x3d,0x3d,0x47 +.byte 0xc8,0x64,0x64,0xac, 0xba,0x5d,0x5d,0xe7 +.byte 0x32,0x19,0x19,0x2b, 0xe6,0x73,0x73,0x95 +.byte 0xc0,0x60,0x60,0xa0, 0x19,0x81,0x81,0x98 +.byte 0x9e,0x4f,0x4f,0xd1, 0xa3,0xdc,0xdc,0x7f +.byte 0x44,0x22,0x22,0x66, 0x54,0x2a,0x2a,0x7e +.byte 0x3b,0x90,0x90,0xab, 0x0b,0x88,0x88,0x83 +.byte 0x8c,0x46,0x46,0xca, 0xc7,0xee,0xee,0x29 +.byte 0x6b,0xb8,0xb8,0xd3, 0x28,0x14,0x14,0x3c +.byte 0xa7,0xde,0xde,0x79, 0xbc,0x5e,0x5e,0xe2 +.byte 0x16,0x0b,0x0b,0x1d, 0xad,0xdb,0xdb,0x76 +.byte 0xdb,0xe0,0xe0,0x3b, 0x64,0x32,0x32,0x56 +.byte 0x74,0x3a,0x3a,0x4e, 0x14,0x0a,0x0a,0x1e +.byte 0x92,0x49,0x49,0xdb, 0x0c,0x06,0x06,0x0a +.byte 0x48,0x24,0x24,0x6c, 0xb8,0x5c,0x5c,0xe4 +.byte 0x9f,0xc2,0xc2,0x5d, 0xbd,0xd3,0xd3,0x6e +.byte 0x43,0xac,0xac,0xef, 0xc4,0x62,0x62,0xa6 +.byte 0x39,0x91,0x91,0xa8, 0x31,0x95,0x95,0xa4 +.byte 0xd3,0xe4,0xe4,0x37, 0xf2,0x79,0x79,0x8b +.byte 0xd5,0xe7,0xe7,0x32, 0x8b,0xc8,0xc8,0x43 +.byte 0x6e,0x37,0x37,0x59, 0xda,0x6d,0x6d,0xb7 +.byte 0x01,0x8d,0x8d,0x8c, 0xb1,0xd5,0xd5,0x64 +.byte 0x9c,0x4e,0x4e,0xd2, 0x49,0xa9,0xa9,0xe0 +.byte 0xd8,0x6c,0x6c,0xb4, 0xac,0x56,0x56,0xfa +.byte 0xf3,0xf4,0xf4,0x07, 0xcf,0xea,0xea,0x25 +.byte 0xca,0x65,0x65,0xaf, 0xf4,0x7a,0x7a,0x8e +.byte 0x47,0xae,0xae,0xe9, 0x10,0x08,0x08,0x18 +.byte 0x6f,0xba,0xba,0xd5, 0xf0,0x78,0x78,0x88 +.byte 0x4a,0x25,0x25,0x6f, 0x5c,0x2e,0x2e,0x72 +.byte 0x38,0x1c,0x1c,0x24, 0x57,0xa6,0xa6,0xf1 +.byte 0x73,0xb4,0xb4,0xc7, 0x97,0xc6,0xc6,0x51 +.byte 0xcb,0xe8,0xe8,0x23, 0xa1,0xdd,0xdd,0x7c +.byte 0xe8,0x74,0x74,0x9c, 0x3e,0x1f,0x1f,0x21 +.byte 0x96,0x4b,0x4b,0xdd, 0x61,0xbd,0xbd,0xdc +.byte 0x0d,0x8b,0x8b,0x86, 0x0f,0x8a,0x8a,0x85 +.byte 0xe0,0x70,0x70,0x90, 0x7c,0x3e,0x3e,0x42 +.byte 0x71,0xb5,0xb5,0xc4, 0xcc,0x66,0x66,0xaa +.byte 0x90,0x48,0x48,0xd8, 0x06,0x03,0x03,0x05 +.byte 0xf7,0xf6,0xf6,0x01, 0x1c,0x0e,0x0e,0x12 +.byte 0xc2,0x61,0x61,0xa3, 0x6a,0x35,0x35,0x5f +.byte 0xae,0x57,0x57,0xf9, 0x69,0xb9,0xb9,0xd0 +.byte 0x17,0x86,0x86,0x91, 0x99,0xc1,0xc1,0x58 +.byte 0x3a,0x1d,0x1d,0x27, 0x27,0x9e,0x9e,0xb9 +.byte 0xd9,0xe1,0xe1,0x38, 0xeb,0xf8,0xf8,0x13 +.byte 0x2b,0x98,0x98,0xb3, 0x22,0x11,0x11,0x33 +.byte 0xd2,0x69,0x69,0xbb, 0xa9,0xd9,0xd9,0x70 +.byte 0x07,0x8e,0x8e,0x89, 0x33,0x94,0x94,0xa7 +.byte 0x2d,0x9b,0x9b,0xb6, 0x3c,0x1e,0x1e,0x22 +.byte 0x15,0x87,0x87,0x92, 0xc9,0xe9,0xe9,0x20 +.byte 0x87,0xce,0xce,0x49, 0xaa,0x55,0x55,0xff +.byte 0x50,0x28,0x28,0x78, 0xa5,0xdf,0xdf,0x7a +.byte 0x03,0x8c,0x8c,0x8f, 0x59,0xa1,0xa1,0xf8 +.byte 0x09,0x89,0x89,0x80, 0x1a,0x0d,0x0d,0x17 +.byte 0x65,0xbf,0xbf,0xda, 0xd7,0xe6,0xe6,0x31 +.byte 0x84,0x42,0x42,0xc6, 0xd0,0x68,0x68,0xb8 +.byte 0x82,0x41,0x41,0xc3, 0x29,0x99,0x99,0xb0 +.byte 0x5a,0x2d,0x2d,0x77, 0x1e,0x0f,0x0f,0x11 +.byte 0x7b,0xb0,0xb0,0xcb, 0xa8,0x54,0x54,0xfc +.byte 0x6d,0xbb,0xbb,0xd6, 0x2c,0x16,0x16,0x3a + +.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 # Te4 +.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 +.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 +.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 +.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc +.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 +.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a +.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 +.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 +.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 +.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b +.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf +.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 +.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 +.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 +.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 +.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 +.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 +.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 +.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb +.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c +.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 +.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 +.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 +.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 +.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a +.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e +.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e +.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 +.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf +.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 +.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 + +.byte 0x01,0x00,0x00,0x00, 0x02,0x00,0x00,0x00 # rcon +.byte 0x04,0x00,0x00,0x00, 0x08,0x00,0x00,0x00 +.byte 0x10,0x00,0x00,0x00, 0x20,0x00,0x00,0x00 +.byte 0x40,0x00,0x00,0x00, 0x80,0x00,0x00,0x00 +.byte 0x1B,0x00,0x00,0x00, 0x36,0x00,0x00,0x00 + +.align 6 +AES_Td: +.byte 0x51,0xf4,0xa7,0x50, 0x7e,0x41,0x65,0x53 # Td0 +.byte 0x1a,0x17,0xa4,0xc3, 0x3a,0x27,0x5e,0x96 +.byte 0x3b,0xab,0x6b,0xcb, 0x1f,0x9d,0x45,0xf1 +.byte 0xac,0xfa,0x58,0xab, 0x4b,0xe3,0x03,0x93 +.byte 0x20,0x30,0xfa,0x55, 0xad,0x76,0x6d,0xf6 +.byte 0x88,0xcc,0x76,0x91, 0xf5,0x02,0x4c,0x25 +.byte 0x4f,0xe5,0xd7,0xfc, 0xc5,0x2a,0xcb,0xd7 +.byte 0x26,0x35,0x44,0x80, 0xb5,0x62,0xa3,0x8f +.byte 0xde,0xb1,0x5a,0x49, 0x25,0xba,0x1b,0x67 +.byte 0x45,0xea,0x0e,0x98, 0x5d,0xfe,0xc0,0xe1 +.byte 0xc3,0x2f,0x75,0x02, 0x81,0x4c,0xf0,0x12 +.byte 0x8d,0x46,0x97,0xa3, 0x6b,0xd3,0xf9,0xc6 +.byte 0x03,0x8f,0x5f,0xe7, 0x15,0x92,0x9c,0x95 +.byte 0xbf,0x6d,0x7a,0xeb, 0x95,0x52,0x59,0xda +.byte 0xd4,0xbe,0x83,0x2d, 0x58,0x74,0x21,0xd3 +.byte 0x49,0xe0,0x69,0x29, 0x8e,0xc9,0xc8,0x44 +.byte 0x75,0xc2,0x89,0x6a, 0xf4,0x8e,0x79,0x78 +.byte 0x99,0x58,0x3e,0x6b, 0x27,0xb9,0x71,0xdd +.byte 0xbe,0xe1,0x4f,0xb6, 0xf0,0x88,0xad,0x17 +.byte 0xc9,0x20,0xac,0x66, 0x7d,0xce,0x3a,0xb4 +.byte 0x63,0xdf,0x4a,0x18, 0xe5,0x1a,0x31,0x82 +.byte 0x97,0x51,0x33,0x60, 0x62,0x53,0x7f,0x45 +.byte 0xb1,0x64,0x77,0xe0, 0xbb,0x6b,0xae,0x84 +.byte 0xfe,0x81,0xa0,0x1c, 0xf9,0x08,0x2b,0x94 +.byte 0x70,0x48,0x68,0x58, 0x8f,0x45,0xfd,0x19 +.byte 0x94,0xde,0x6c,0x87, 0x52,0x7b,0xf8,0xb7 +.byte 0xab,0x73,0xd3,0x23, 0x72,0x4b,0x02,0xe2 +.byte 0xe3,0x1f,0x8f,0x57, 0x66,0x55,0xab,0x2a +.byte 0xb2,0xeb,0x28,0x07, 0x2f,0xb5,0xc2,0x03 +.byte 0x86,0xc5,0x7b,0x9a, 0xd3,0x37,0x08,0xa5 +.byte 0x30,0x28,0x87,0xf2, 0x23,0xbf,0xa5,0xb2 +.byte 0x02,0x03,0x6a,0xba, 0xed,0x16,0x82,0x5c +.byte 0x8a,0xcf,0x1c,0x2b, 0xa7,0x79,0xb4,0x92 +.byte 0xf3,0x07,0xf2,0xf0, 0x4e,0x69,0xe2,0xa1 +.byte 0x65,0xda,0xf4,0xcd, 0x06,0x05,0xbe,0xd5 +.byte 0xd1,0x34,0x62,0x1f, 0xc4,0xa6,0xfe,0x8a +.byte 0x34,0x2e,0x53,0x9d, 0xa2,0xf3,0x55,0xa0 +.byte 0x05,0x8a,0xe1,0x32, 0xa4,0xf6,0xeb,0x75 +.byte 0x0b,0x83,0xec,0x39, 0x40,0x60,0xef,0xaa +.byte 0x5e,0x71,0x9f,0x06, 0xbd,0x6e,0x10,0x51 +.byte 0x3e,0x21,0x8a,0xf9, 0x96,0xdd,0x06,0x3d +.byte 0xdd,0x3e,0x05,0xae, 0x4d,0xe6,0xbd,0x46 +.byte 0x91,0x54,0x8d,0xb5, 0x71,0xc4,0x5d,0x05 +.byte 0x04,0x06,0xd4,0x6f, 0x60,0x50,0x15,0xff +.byte 0x19,0x98,0xfb,0x24, 0xd6,0xbd,0xe9,0x97 +.byte 0x89,0x40,0x43,0xcc, 0x67,0xd9,0x9e,0x77 +.byte 0xb0,0xe8,0x42,0xbd, 0x07,0x89,0x8b,0x88 +.byte 0xe7,0x19,0x5b,0x38, 0x79,0xc8,0xee,0xdb +.byte 0xa1,0x7c,0x0a,0x47, 0x7c,0x42,0x0f,0xe9 +.byte 0xf8,0x84,0x1e,0xc9, 0x00,0x00,0x00,0x00 +.byte 0x09,0x80,0x86,0x83, 0x32,0x2b,0xed,0x48 +.byte 0x1e,0x11,0x70,0xac, 0x6c,0x5a,0x72,0x4e +.byte 0xfd,0x0e,0xff,0xfb, 0x0f,0x85,0x38,0x56 +.byte 0x3d,0xae,0xd5,0x1e, 0x36,0x2d,0x39,0x27 +.byte 0x0a,0x0f,0xd9,0x64, 0x68,0x5c,0xa6,0x21 +.byte 0x9b,0x5b,0x54,0xd1, 0x24,0x36,0x2e,0x3a +.byte 0x0c,0x0a,0x67,0xb1, 0x93,0x57,0xe7,0x0f +.byte 0xb4,0xee,0x96,0xd2, 0x1b,0x9b,0x91,0x9e +.byte 0x80,0xc0,0xc5,0x4f, 0x61,0xdc,0x20,0xa2 +.byte 0x5a,0x77,0x4b,0x69, 0x1c,0x12,0x1a,0x16 +.byte 0xe2,0x93,0xba,0x0a, 0xc0,0xa0,0x2a,0xe5 +.byte 0x3c,0x22,0xe0,0x43, 0x12,0x1b,0x17,0x1d +.byte 0x0e,0x09,0x0d,0x0b, 0xf2,0x8b,0xc7,0xad +.byte 0x2d,0xb6,0xa8,0xb9, 0x14,0x1e,0xa9,0xc8 +.byte 0x57,0xf1,0x19,0x85, 0xaf,0x75,0x07,0x4c +.byte 0xee,0x99,0xdd,0xbb, 0xa3,0x7f,0x60,0xfd +.byte 0xf7,0x01,0x26,0x9f, 0x5c,0x72,0xf5,0xbc +.byte 0x44,0x66,0x3b,0xc5, 0x5b,0xfb,0x7e,0x34 +.byte 0x8b,0x43,0x29,0x76, 0xcb,0x23,0xc6,0xdc +.byte 0xb6,0xed,0xfc,0x68, 0xb8,0xe4,0xf1,0x63 +.byte 0xd7,0x31,0xdc,0xca, 0x42,0x63,0x85,0x10 +.byte 0x13,0x97,0x22,0x40, 0x84,0xc6,0x11,0x20 +.byte 0x85,0x4a,0x24,0x7d, 0xd2,0xbb,0x3d,0xf8 +.byte 0xae,0xf9,0x32,0x11, 0xc7,0x29,0xa1,0x6d +.byte 0x1d,0x9e,0x2f,0x4b, 0xdc,0xb2,0x30,0xf3 +.byte 0x0d,0x86,0x52,0xec, 0x77,0xc1,0xe3,0xd0 +.byte 0x2b,0xb3,0x16,0x6c, 0xa9,0x70,0xb9,0x99 +.byte 0x11,0x94,0x48,0xfa, 0x47,0xe9,0x64,0x22 +.byte 0xa8,0xfc,0x8c,0xc4, 0xa0,0xf0,0x3f,0x1a +.byte 0x56,0x7d,0x2c,0xd8, 0x22,0x33,0x90,0xef +.byte 0x87,0x49,0x4e,0xc7, 0xd9,0x38,0xd1,0xc1 +.byte 0x8c,0xca,0xa2,0xfe, 0x98,0xd4,0x0b,0x36 +.byte 0xa6,0xf5,0x81,0xcf, 0xa5,0x7a,0xde,0x28 +.byte 0xda,0xb7,0x8e,0x26, 0x3f,0xad,0xbf,0xa4 +.byte 0x2c,0x3a,0x9d,0xe4, 0x50,0x78,0x92,0x0d +.byte 0x6a,0x5f,0xcc,0x9b, 0x54,0x7e,0x46,0x62 +.byte 0xf6,0x8d,0x13,0xc2, 0x90,0xd8,0xb8,0xe8 +.byte 0x2e,0x39,0xf7,0x5e, 0x82,0xc3,0xaf,0xf5 +.byte 0x9f,0x5d,0x80,0xbe, 0x69,0xd0,0x93,0x7c +.byte 0x6f,0xd5,0x2d,0xa9, 0xcf,0x25,0x12,0xb3 +.byte 0xc8,0xac,0x99,0x3b, 0x10,0x18,0x7d,0xa7 +.byte 0xe8,0x9c,0x63,0x6e, 0xdb,0x3b,0xbb,0x7b +.byte 0xcd,0x26,0x78,0x09, 0x6e,0x59,0x18,0xf4 +.byte 0xec,0x9a,0xb7,0x01, 0x83,0x4f,0x9a,0xa8 +.byte 0xe6,0x95,0x6e,0x65, 0xaa,0xff,0xe6,0x7e +.byte 0x21,0xbc,0xcf,0x08, 0xef,0x15,0xe8,0xe6 +.byte 0xba,0xe7,0x9b,0xd9, 0x4a,0x6f,0x36,0xce +.byte 0xea,0x9f,0x09,0xd4, 0x29,0xb0,0x7c,0xd6 +.byte 0x31,0xa4,0xb2,0xaf, 0x2a,0x3f,0x23,0x31 +.byte 0xc6,0xa5,0x94,0x30, 0x35,0xa2,0x66,0xc0 +.byte 0x74,0x4e,0xbc,0x37, 0xfc,0x82,0xca,0xa6 +.byte 0xe0,0x90,0xd0,0xb0, 0x33,0xa7,0xd8,0x15 +.byte 0xf1,0x04,0x98,0x4a, 0x41,0xec,0xda,0xf7 +.byte 0x7f,0xcd,0x50,0x0e, 0x17,0x91,0xf6,0x2f +.byte 0x76,0x4d,0xd6,0x8d, 0x43,0xef,0xb0,0x4d +.byte 0xcc,0xaa,0x4d,0x54, 0xe4,0x96,0x04,0xdf +.byte 0x9e,0xd1,0xb5,0xe3, 0x4c,0x6a,0x88,0x1b +.byte 0xc1,0x2c,0x1f,0xb8, 0x46,0x65,0x51,0x7f +.byte 0x9d,0x5e,0xea,0x04, 0x01,0x8c,0x35,0x5d +.byte 0xfa,0x87,0x74,0x73, 0xfb,0x0b,0x41,0x2e +.byte 0xb3,0x67,0x1d,0x5a, 0x92,0xdb,0xd2,0x52 +.byte 0xe9,0x10,0x56,0x33, 0x6d,0xd6,0x47,0x13 +.byte 0x9a,0xd7,0x61,0x8c, 0x37,0xa1,0x0c,0x7a +.byte 0x59,0xf8,0x14,0x8e, 0xeb,0x13,0x3c,0x89 +.byte 0xce,0xa9,0x27,0xee, 0xb7,0x61,0xc9,0x35 +.byte 0xe1,0x1c,0xe5,0xed, 0x7a,0x47,0xb1,0x3c +.byte 0x9c,0xd2,0xdf,0x59, 0x55,0xf2,0x73,0x3f +.byte 0x18,0x14,0xce,0x79, 0x73,0xc7,0x37,0xbf +.byte 0x53,0xf7,0xcd,0xea, 0x5f,0xfd,0xaa,0x5b +.byte 0xdf,0x3d,0x6f,0x14, 0x78,0x44,0xdb,0x86 +.byte 0xca,0xaf,0xf3,0x81, 0xb9,0x68,0xc4,0x3e +.byte 0x38,0x24,0x34,0x2c, 0xc2,0xa3,0x40,0x5f +.byte 0x16,0x1d,0xc3,0x72, 0xbc,0xe2,0x25,0x0c +.byte 0x28,0x3c,0x49,0x8b, 0xff,0x0d,0x95,0x41 +.byte 0x39,0xa8,0x01,0x71, 0x08,0x0c,0xb3,0xde +.byte 0xd8,0xb4,0xe4,0x9c, 0x64,0x56,0xc1,0x90 +.byte 0x7b,0xcb,0x84,0x61, 0xd5,0x32,0xb6,0x70 +.byte 0x48,0x6c,0x5c,0x74, 0xd0,0xb8,0x57,0x42 + +.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 # Td4 +.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb +.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 +.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb +.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d +.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e +.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 +.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 +.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 +.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 +.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda +.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 +.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a +.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 +.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 +.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b +.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea +.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 +.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 +.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e +.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 +.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b +.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 +.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 +.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 +.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f +.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d +.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef +.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 +.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 +.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 +.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + # made-up _instructions, _xtr, _ins, _ror and _bias, cope + # with byte order dependencies... + if (/^\s+_/) { + s/(_[a-z]+\s+)(\$[0-9]+),([^,]+)(#.*)*$/$1$2,$2,$3/; + + s/_xtr\s+(\$[0-9]+),(\$[0-9]+),([0-9]+(\-2)*)/ + sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) + : eval("24-$3"))/e or + s/_ins\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ + sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) + : eval("24-$3"))/e or + s/_ror\s+(\$[0-9]+),(\$[0-9]+),(\-?[0-9]+)/ + sprintf("srl\t$1,$2,%d",$big_endian ? eval($3) + : eval("$3*-1"))/e or + s/_bias\s+(\$[0-9]+),(\$[0-9]+),([0-9]+)/ + sprintf("sll\t$1,$2,%d",$big_endian ? eval($3) + : eval("($3-16)&31"))/e; + + s/srl\s+(\$[0-9]+),(\$[0-9]+),\-([0-9]+)/ + sprintf("sll\t$1,$2,$3")/e or + s/srl\s+(\$[0-9]+),(\$[0-9]+),0/ + sprintf("and\t$1,$2,0xff")/e or + s/(sll\s+\$[0-9]+,\$[0-9]+,0)/#$1/; + } + + # convert lwl/lwr and swr/swl to little-endian order + if (!$big_endian && /^\s+[sl]w[lr]\s+/) { + s/([sl]wl.*)([0-9]+)\((\$[0-9]+)\)/ + sprintf("$1%d($3)",eval("$2-$2%4+($2%4-1)&3"))/e or + s/([sl]wr.*)([0-9]+)\((\$[0-9]+)\)/ + sprintf("$1%d($3)",eval("$2-$2%4+($2%4+1)&3"))/e; + } + + print $_,"\n"; +} + +close STDOUT; diff --git a/src/lib/libcrypto/aes/asm/aes-parisc.pl b/src/lib/libcrypto/aes/asm/aes-parisc.pl new file mode 100644 index 0000000000..c36b6a2270 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aes-parisc.pl @@ -0,0 +1,1021 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# AES for PA-RISC. +# +# June 2009. +# +# The module is mechanical transliteration of aes-sparcv9.pl, but with +# a twist: S-boxes are compressed even further down to 1K+256B. On +# PA-7100LC performance is ~40% better than gcc 3.2 generated code and +# is about 33 cycles per byte processed with 128-bit key. Newer CPUs +# perform at 16 cycles per byte. It's not faster than code generated +# by vendor compiler, but recall that it has compressed S-boxes, which +# requires extra processing. +# +# Special thanks to polarhome.com for providing HP-UX account. + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; +} else { + $LEVEL ="1.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; +} + +$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker + # [+ argument transfer] +$inp="%r26"; # arg0 +$out="%r25"; # arg1 +$key="%r24"; # arg2 + +($s0,$s1,$s2,$s3) = ("%r1","%r2","%r3","%r4"); +($t0,$t1,$t2,$t3) = ("%r5","%r6","%r7","%r8"); + +($acc0, $acc1, $acc2, $acc3, $acc4, $acc5, $acc6, $acc7, + $acc8, $acc9,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) = +("%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16", +"%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r26"); + +$tbl="%r28"; +$rounds="%r29"; + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT AES_encrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR + .ALIGN 64 +AES_encrypt + .PROC + .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) + $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) + $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) + $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) + $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) + $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) + $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) + $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) + + blr %r0,$tbl + ldi 3,$t0 +L\$enc_pic + andcm $tbl,$t0,$tbl + ldo L\$AES_Te-L\$enc_pic($tbl),$tbl + + and $inp,$t0,$t0 + sub $inp,$t0,$inp + ldw 0($inp),$s0 + ldw 4($inp),$s1 + ldw 8($inp),$s2 + comib,= 0,$t0,L\$enc_inp_aligned + ldw 12($inp),$s3 + + sh3addl $t0,%r0,$t0 + subi 32,$t0,$t0 + mtctl $t0,%cr11 + ldw 16($inp),$t1 + vshd $s0,$s1,$s0 + vshd $s1,$s2,$s1 + vshd $s2,$s3,$s2 + vshd $s3,$t1,$s3 + +L\$enc_inp_aligned + bl _parisc_AES_encrypt,%r31 + nop + + extru,<> $out,31,2,%r0 + b L\$enc_out_aligned + nop + + _srm $s0,24,$acc0 + _srm $s0,16,$acc1 + stb $acc0,0($out) + _srm $s0,8,$acc2 + stb $acc1,1($out) + _srm $s1,24,$acc4 + stb $acc2,2($out) + _srm $s1,16,$acc5 + stb $s0,3($out) + _srm $s1,8,$acc6 + stb $acc4,4($out) + _srm $s2,24,$acc0 + stb $acc5,5($out) + _srm $s2,16,$acc1 + stb $acc6,6($out) + _srm $s2,8,$acc2 + stb $s1,7($out) + _srm $s3,24,$acc4 + stb $acc0,8($out) + _srm $s3,16,$acc5 + stb $acc1,9($out) + _srm $s3,8,$acc6 + stb $acc2,10($out) + stb $s2,11($out) + stb $acc4,12($out) + stb $acc5,13($out) + stb $acc6,14($out) + b L\$enc_done + stb $s3,15($out) + +L\$enc_out_aligned + stw $s0,0($out) + stw $s1,4($out) + stw $s2,8($out) + stw $s3,12($out) + +L\$enc_done + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 + $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 + $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 + $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 + $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 + $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 + $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 + $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + + .ALIGN 16 +_parisc_AES_encrypt + .PROC + .CALLINFO MILLICODE + .ENTRY + ldw 240($key),$rounds + ldw 0($key),$t0 + ldw 4($key),$t1 + ldw 8($key),$t2 + _srm $rounds,1,$rounds + xor $t0,$s0,$s0 + ldw 12($key),$t3 + _srm $s0,24,$acc0 + xor $t1,$s1,$s1 + ldw 16($key),$t0 + _srm $s1,16,$acc1 + xor $t2,$s2,$s2 + ldw 20($key),$t1 + xor $t3,$s3,$s3 + ldw 24($key),$t2 + ldw 28($key),$t3 +L\$enc_loop + _srm $s2,8,$acc2 + ldwx,s $acc0($tbl),$acc0 + _srm $s3,0,$acc3 + ldwx,s $acc1($tbl),$acc1 + _srm $s1,24,$acc4 + ldwx,s $acc2($tbl),$acc2 + _srm $s2,16,$acc5 + ldwx,s $acc3($tbl),$acc3 + _srm $s3,8,$acc6 + ldwx,s $acc4($tbl),$acc4 + _srm $s0,0,$acc7 + ldwx,s $acc5($tbl),$acc5 + _srm $s2,24,$acc8 + ldwx,s $acc6($tbl),$acc6 + _srm $s3,16,$acc9 + ldwx,s $acc7($tbl),$acc7 + _srm $s0,8,$acc10 + ldwx,s $acc8($tbl),$acc8 + _srm $s1,0,$acc11 + ldwx,s $acc9($tbl),$acc9 + _srm $s3,24,$acc12 + ldwx,s $acc10($tbl),$acc10 + _srm $s0,16,$acc13 + ldwx,s $acc11($tbl),$acc11 + _srm $s1,8,$acc14 + ldwx,s $acc12($tbl),$acc12 + _srm $s2,0,$acc15 + ldwx,s $acc13($tbl),$acc13 + ldwx,s $acc14($tbl),$acc14 + ldwx,s $acc15($tbl),$acc15 + addib,= -1,$rounds,L\$enc_last + ldo 32($key),$key + + _ror $acc1,8,$acc1 + xor $acc0,$t0,$t0 + ldw 0($key),$s0 + _ror $acc2,16,$acc2 + xor $acc1,$t0,$t0 + ldw 4($key),$s1 + _ror $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ldw 8($key),$s2 + _ror $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ldw 12($key),$s3 + _ror $acc6,16,$acc6 + xor $acc4,$t1,$t1 + _ror $acc7,24,$acc7 + xor $acc5,$t1,$t1 + _ror $acc9,8,$acc9 + xor $acc6,$t1,$t1 + _ror $acc10,16,$acc10 + xor $acc7,$t1,$t1 + _ror $acc11,24,$acc11 + xor $acc8,$t2,$t2 + _ror $acc13,8,$acc13 + xor $acc9,$t2,$t2 + _ror $acc14,16,$acc14 + xor $acc10,$t2,$t2 + _ror $acc15,24,$acc15 + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + _srm $t0,24,$acc0 + xor $acc14,$t3,$t3 + _srm $t1,16,$acc1 + xor $acc15,$t3,$t3 + + _srm $t2,8,$acc2 + ldwx,s $acc0($tbl),$acc0 + _srm $t3,0,$acc3 + ldwx,s $acc1($tbl),$acc1 + _srm $t1,24,$acc4 + ldwx,s $acc2($tbl),$acc2 + _srm $t2,16,$acc5 + ldwx,s $acc3($tbl),$acc3 + _srm $t3,8,$acc6 + ldwx,s $acc4($tbl),$acc4 + _srm $t0,0,$acc7 + ldwx,s $acc5($tbl),$acc5 + _srm $t2,24,$acc8 + ldwx,s $acc6($tbl),$acc6 + _srm $t3,16,$acc9 + ldwx,s $acc7($tbl),$acc7 + _srm $t0,8,$acc10 + ldwx,s $acc8($tbl),$acc8 + _srm $t1,0,$acc11 + ldwx,s $acc9($tbl),$acc9 + _srm $t3,24,$acc12 + ldwx,s $acc10($tbl),$acc10 + _srm $t0,16,$acc13 + ldwx,s $acc11($tbl),$acc11 + _srm $t1,8,$acc14 + ldwx,s $acc12($tbl),$acc12 + _srm $t2,0,$acc15 + ldwx,s $acc13($tbl),$acc13 + _ror $acc1,8,$acc1 + ldwx,s $acc14($tbl),$acc14 + + _ror $acc2,16,$acc2 + xor $acc0,$s0,$s0 + ldwx,s $acc15($tbl),$acc15 + _ror $acc3,24,$acc3 + xor $acc1,$s0,$s0 + ldw 16($key),$t0 + _ror $acc5,8,$acc5 + xor $acc2,$s0,$s0 + ldw 20($key),$t1 + _ror $acc6,16,$acc6 + xor $acc3,$s0,$s0 + ldw 24($key),$t2 + _ror $acc7,24,$acc7 + xor $acc4,$s1,$s1 + ldw 28($key),$t3 + _ror $acc9,8,$acc9 + xor $acc5,$s1,$s1 + ldw 1024+0($tbl),%r0 ; prefetch te4 + _ror $acc10,16,$acc10 + xor $acc6,$s1,$s1 + ldw 1024+32($tbl),%r0 ; prefetch te4 + _ror $acc11,24,$acc11 + xor $acc7,$s1,$s1 + ldw 1024+64($tbl),%r0 ; prefetch te4 + _ror $acc13,8,$acc13 + xor $acc8,$s2,$s2 + ldw 1024+96($tbl),%r0 ; prefetch te4 + _ror $acc14,16,$acc14 + xor $acc9,$s2,$s2 + ldw 1024+128($tbl),%r0 ; prefetch te4 + _ror $acc15,24,$acc15 + xor $acc10,$s2,$s2 + ldw 1024+160($tbl),%r0 ; prefetch te4 + _srm $s0,24,$acc0 + xor $acc11,$s2,$s2 + ldw 1024+192($tbl),%r0 ; prefetch te4 + xor $acc12,$acc14,$acc14 + xor $acc13,$s3,$s3 + ldw 1024+224($tbl),%r0 ; prefetch te4 + _srm $s1,16,$acc1 + xor $acc14,$s3,$s3 + b L\$enc_loop + xor $acc15,$s3,$s3 + + .ALIGN 16 +L\$enc_last + ldo 1024($tbl),$rounds + _ror $acc1,8,$acc1 + xor $acc0,$t0,$t0 + ldw 0($key),$s0 + _ror $acc2,16,$acc2 + xor $acc1,$t0,$t0 + ldw 4($key),$s1 + _ror $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ldw 8($key),$s2 + _ror $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ldw 12($key),$s3 + _ror $acc6,16,$acc6 + xor $acc4,$t1,$t1 + _ror $acc7,24,$acc7 + xor $acc5,$t1,$t1 + _ror $acc9,8,$acc9 + xor $acc6,$t1,$t1 + _ror $acc10,16,$acc10 + xor $acc7,$t1,$t1 + _ror $acc11,24,$acc11 + xor $acc8,$t2,$t2 + _ror $acc13,8,$acc13 + xor $acc9,$t2,$t2 + _ror $acc14,16,$acc14 + xor $acc10,$t2,$t2 + _ror $acc15,24,$acc15 + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + _srm $t0,24,$acc0 + xor $acc14,$t3,$t3 + _srm $t1,16,$acc1 + xor $acc15,$t3,$t3 + + _srm $t2,8,$acc2 + ldbx $acc0($rounds),$acc0 + _srm $t1,24,$acc4 + ldbx $acc1($rounds),$acc1 + _srm $t2,16,$acc5 + _srm $t3,0,$acc3 + ldbx $acc2($rounds),$acc2 + ldbx $acc3($rounds),$acc3 + _srm $t3,8,$acc6 + ldbx $acc4($rounds),$acc4 + _srm $t2,24,$acc8 + ldbx $acc5($rounds),$acc5 + _srm $t3,16,$acc9 + _srm $t0,0,$acc7 + ldbx $acc6($rounds),$acc6 + ldbx $acc7($rounds),$acc7 + _srm $t0,8,$acc10 + ldbx $acc8($rounds),$acc8 + _srm $t3,24,$acc12 + ldbx $acc9($rounds),$acc9 + _srm $t0,16,$acc13 + _srm $t1,0,$acc11 + ldbx $acc10($rounds),$acc10 + _srm $t1,8,$acc14 + ldbx $acc11($rounds),$acc11 + ldbx $acc12($rounds),$acc12 + ldbx $acc13($rounds),$acc13 + _srm $t2,0,$acc15 + ldbx $acc14($rounds),$acc14 + + dep $acc0,7,8,$acc3 + ldbx $acc15($rounds),$acc15 + dep $acc4,7,8,$acc7 + dep $acc1,15,8,$acc3 + dep $acc5,15,8,$acc7 + dep $acc2,23,8,$acc3 + dep $acc6,23,8,$acc7 + xor $acc3,$s0,$s0 + xor $acc7,$s1,$s1 + dep $acc8,7,8,$acc11 + dep $acc12,7,8,$acc15 + dep $acc9,15,8,$acc11 + dep $acc13,15,8,$acc15 + dep $acc10,23,8,$acc11 + dep $acc14,23,8,$acc15 + xor $acc11,$s2,$s2 + + bv (%r31) + .EXIT + xor $acc15,$s3,$s3 + .PROCEND + + .ALIGN 64 +L\$AES_Te + .WORD 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d + .WORD 0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554 + .WORD 0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d + .WORD 0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a + .WORD 0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87 + .WORD 0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b + .WORD 0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea + .WORD 0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b + .WORD 0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a + .WORD 0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f + .WORD 0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108 + .WORD 0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f + .WORD 0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e + .WORD 0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5 + .WORD 0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d + .WORD 0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f + .WORD 0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e + .WORD 0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb + .WORD 0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce + .WORD 0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497 + .WORD 0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c + .WORD 0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed + .WORD 0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b + .WORD 0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a + .WORD 0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16 + .WORD 0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594 + .WORD 0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81 + .WORD 0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3 + .WORD 0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a + .WORD 0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504 + .WORD 0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163 + .WORD 0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d + .WORD 0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f + .WORD 0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739 + .WORD 0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47 + .WORD 0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395 + .WORD 0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f + .WORD 0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883 + .WORD 0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c + .WORD 0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76 + .WORD 0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e + .WORD 0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4 + .WORD 0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6 + .WORD 0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b + .WORD 0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7 + .WORD 0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0 + .WORD 0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25 + .WORD 0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818 + .WORD 0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72 + .WORD 0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651 + .WORD 0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21 + .WORD 0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85 + .WORD 0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa + .WORD 0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12 + .WORD 0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0 + .WORD 0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9 + .WORD 0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133 + .WORD 0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7 + .WORD 0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920 + .WORD 0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a + .WORD 0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17 + .WORD 0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8 + .WORD 0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11 + .WORD 0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a + .BYTE 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5 + .BYTE 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 + .BYTE 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0 + .BYTE 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 + .BYTE 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc + .BYTE 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 + .BYTE 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a + .BYTE 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 + .BYTE 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0 + .BYTE 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 + .BYTE 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b + .BYTE 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf + .BYTE 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85 + .BYTE 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 + .BYTE 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5 + .BYTE 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 + .BYTE 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17 + .BYTE 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 + .BYTE 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88 + .BYTE 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb + .BYTE 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c + .BYTE 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 + .BYTE 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9 + .BYTE 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 + .BYTE 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6 + .BYTE 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a + .BYTE 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e + .BYTE 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e + .BYTE 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94 + .BYTE 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf + .BYTE 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68 + .BYTE 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 +___ + +$code.=<<___; + .EXPORT AES_decrypt,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR + .ALIGN 16 +AES_decrypt + .PROC + .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) + $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) + $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) + $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) + $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) + $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) + $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) + $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) + + blr %r0,$tbl + ldi 3,$t0 +L\$dec_pic + andcm $tbl,$t0,$tbl + ldo L\$AES_Td-L\$dec_pic($tbl),$tbl + + and $inp,$t0,$t0 + sub $inp,$t0,$inp + ldw 0($inp),$s0 + ldw 4($inp),$s1 + ldw 8($inp),$s2 + comib,= 0,$t0,L\$dec_inp_aligned + ldw 12($inp),$s3 + + sh3addl $t0,%r0,$t0 + subi 32,$t0,$t0 + mtctl $t0,%cr11 + ldw 16($inp),$t1 + vshd $s0,$s1,$s0 + vshd $s1,$s2,$s1 + vshd $s2,$s3,$s2 + vshd $s3,$t1,$s3 + +L\$dec_inp_aligned + bl _parisc_AES_decrypt,%r31 + nop + + extru,<> $out,31,2,%r0 + b L\$dec_out_aligned + nop + + _srm $s0,24,$acc0 + _srm $s0,16,$acc1 + stb $acc0,0($out) + _srm $s0,8,$acc2 + stb $acc1,1($out) + _srm $s1,24,$acc4 + stb $acc2,2($out) + _srm $s1,16,$acc5 + stb $s0,3($out) + _srm $s1,8,$acc6 + stb $acc4,4($out) + _srm $s2,24,$acc0 + stb $acc5,5($out) + _srm $s2,16,$acc1 + stb $acc6,6($out) + _srm $s2,8,$acc2 + stb $s1,7($out) + _srm $s3,24,$acc4 + stb $acc0,8($out) + _srm $s3,16,$acc5 + stb $acc1,9($out) + _srm $s3,8,$acc6 + stb $acc2,10($out) + stb $s2,11($out) + stb $acc4,12($out) + stb $acc5,13($out) + stb $acc6,14($out) + b L\$dec_done + stb $s3,15($out) + +L\$dec_out_aligned + stw $s0,0($out) + stw $s1,4($out) + stw $s2,8($out) + stw $s3,12($out) + +L\$dec_done + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 + $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 + $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 + $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 + $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 + $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 + $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 + $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + + .ALIGN 16 +_parisc_AES_decrypt + .PROC + .CALLINFO MILLICODE + .ENTRY + ldw 240($key),$rounds + ldw 0($key),$t0 + ldw 4($key),$t1 + ldw 8($key),$t2 + ldw 12($key),$t3 + _srm $rounds,1,$rounds + xor $t0,$s0,$s0 + ldw 16($key),$t0 + xor $t1,$s1,$s1 + ldw 20($key),$t1 + _srm $s0,24,$acc0 + xor $t2,$s2,$s2 + ldw 24($key),$t2 + xor $t3,$s3,$s3 + ldw 28($key),$t3 + _srm $s3,16,$acc1 +L\$dec_loop + _srm $s2,8,$acc2 + ldwx,s $acc0($tbl),$acc0 + _srm $s1,0,$acc3 + ldwx,s $acc1($tbl),$acc1 + _srm $s1,24,$acc4 + ldwx,s $acc2($tbl),$acc2 + _srm $s0,16,$acc5 + ldwx,s $acc3($tbl),$acc3 + _srm $s3,8,$acc6 + ldwx,s $acc4($tbl),$acc4 + _srm $s2,0,$acc7 + ldwx,s $acc5($tbl),$acc5 + _srm $s2,24,$acc8 + ldwx,s $acc6($tbl),$acc6 + _srm $s1,16,$acc9 + ldwx,s $acc7($tbl),$acc7 + _srm $s0,8,$acc10 + ldwx,s $acc8($tbl),$acc8 + _srm $s3,0,$acc11 + ldwx,s $acc9($tbl),$acc9 + _srm $s3,24,$acc12 + ldwx,s $acc10($tbl),$acc10 + _srm $s2,16,$acc13 + ldwx,s $acc11($tbl),$acc11 + _srm $s1,8,$acc14 + ldwx,s $acc12($tbl),$acc12 + _srm $s0,0,$acc15 + ldwx,s $acc13($tbl),$acc13 + ldwx,s $acc14($tbl),$acc14 + ldwx,s $acc15($tbl),$acc15 + addib,= -1,$rounds,L\$dec_last + ldo 32($key),$key + + _ror $acc1,8,$acc1 + xor $acc0,$t0,$t0 + ldw 0($key),$s0 + _ror $acc2,16,$acc2 + xor $acc1,$t0,$t0 + ldw 4($key),$s1 + _ror $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ldw 8($key),$s2 + _ror $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ldw 12($key),$s3 + _ror $acc6,16,$acc6 + xor $acc4,$t1,$t1 + _ror $acc7,24,$acc7 + xor $acc5,$t1,$t1 + _ror $acc9,8,$acc9 + xor $acc6,$t1,$t1 + _ror $acc10,16,$acc10 + xor $acc7,$t1,$t1 + _ror $acc11,24,$acc11 + xor $acc8,$t2,$t2 + _ror $acc13,8,$acc13 + xor $acc9,$t2,$t2 + _ror $acc14,16,$acc14 + xor $acc10,$t2,$t2 + _ror $acc15,24,$acc15 + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + _srm $t0,24,$acc0 + xor $acc14,$t3,$t3 + xor $acc15,$t3,$t3 + _srm $t3,16,$acc1 + + _srm $t2,8,$acc2 + ldwx,s $acc0($tbl),$acc0 + _srm $t1,0,$acc3 + ldwx,s $acc1($tbl),$acc1 + _srm $t1,24,$acc4 + ldwx,s $acc2($tbl),$acc2 + _srm $t0,16,$acc5 + ldwx,s $acc3($tbl),$acc3 + _srm $t3,8,$acc6 + ldwx,s $acc4($tbl),$acc4 + _srm $t2,0,$acc7 + ldwx,s $acc5($tbl),$acc5 + _srm $t2,24,$acc8 + ldwx,s $acc6($tbl),$acc6 + _srm $t1,16,$acc9 + ldwx,s $acc7($tbl),$acc7 + _srm $t0,8,$acc10 + ldwx,s $acc8($tbl),$acc8 + _srm $t3,0,$acc11 + ldwx,s $acc9($tbl),$acc9 + _srm $t3,24,$acc12 + ldwx,s $acc10($tbl),$acc10 + _srm $t2,16,$acc13 + ldwx,s $acc11($tbl),$acc11 + _srm $t1,8,$acc14 + ldwx,s $acc12($tbl),$acc12 + _srm $t0,0,$acc15 + ldwx,s $acc13($tbl),$acc13 + _ror $acc1,8,$acc1 + ldwx,s $acc14($tbl),$acc14 + + _ror $acc2,16,$acc2 + xor $acc0,$s0,$s0 + ldwx,s $acc15($tbl),$acc15 + _ror $acc3,24,$acc3 + xor $acc1,$s0,$s0 + ldw 16($key),$t0 + _ror $acc5,8,$acc5 + xor $acc2,$s0,$s0 + ldw 20($key),$t1 + _ror $acc6,16,$acc6 + xor $acc3,$s0,$s0 + ldw 24($key),$t2 + _ror $acc7,24,$acc7 + xor $acc4,$s1,$s1 + ldw 28($key),$t3 + _ror $acc9,8,$acc9 + xor $acc5,$s1,$s1 + ldw 1024+0($tbl),%r0 ; prefetch td4 + _ror $acc10,16,$acc10 + xor $acc6,$s1,$s1 + ldw 1024+32($tbl),%r0 ; prefetch td4 + _ror $acc11,24,$acc11 + xor $acc7,$s1,$s1 + ldw 1024+64($tbl),%r0 ; prefetch td4 + _ror $acc13,8,$acc13 + xor $acc8,$s2,$s2 + ldw 1024+96($tbl),%r0 ; prefetch td4 + _ror $acc14,16,$acc14 + xor $acc9,$s2,$s2 + ldw 1024+128($tbl),%r0 ; prefetch td4 + _ror $acc15,24,$acc15 + xor $acc10,$s2,$s2 + ldw 1024+160($tbl),%r0 ; prefetch td4 + _srm $s0,24,$acc0 + xor $acc11,$s2,$s2 + ldw 1024+192($tbl),%r0 ; prefetch td4 + xor $acc12,$acc14,$acc14 + xor $acc13,$s3,$s3 + ldw 1024+224($tbl),%r0 ; prefetch td4 + xor $acc14,$s3,$s3 + xor $acc15,$s3,$s3 + b L\$dec_loop + _srm $s3,16,$acc1 + + .ALIGN 16 +L\$dec_last + ldo 1024($tbl),$rounds + _ror $acc1,8,$acc1 + xor $acc0,$t0,$t0 + ldw 0($key),$s0 + _ror $acc2,16,$acc2 + xor $acc1,$t0,$t0 + ldw 4($key),$s1 + _ror $acc3,24,$acc3 + xor $acc2,$t0,$t0 + ldw 8($key),$s2 + _ror $acc5,8,$acc5 + xor $acc3,$t0,$t0 + ldw 12($key),$s3 + _ror $acc6,16,$acc6 + xor $acc4,$t1,$t1 + _ror $acc7,24,$acc7 + xor $acc5,$t1,$t1 + _ror $acc9,8,$acc9 + xor $acc6,$t1,$t1 + _ror $acc10,16,$acc10 + xor $acc7,$t1,$t1 + _ror $acc11,24,$acc11 + xor $acc8,$t2,$t2 + _ror $acc13,8,$acc13 + xor $acc9,$t2,$t2 + _ror $acc14,16,$acc14 + xor $acc10,$t2,$t2 + _ror $acc15,24,$acc15 + xor $acc11,$t2,$t2 + xor $acc12,$acc14,$acc14 + xor $acc13,$t3,$t3 + _srm $t0,24,$acc0 + xor $acc14,$t3,$t3 + xor $acc15,$t3,$t3 + _srm $t3,16,$acc1 + + _srm $t2,8,$acc2 + ldbx $acc0($rounds),$acc0 + _srm $t1,24,$acc4 + ldbx $acc1($rounds),$acc1 + _srm $t0,16,$acc5 + _srm $t1,0,$acc3 + ldbx $acc2($rounds),$acc2 + ldbx $acc3($rounds),$acc3 + _srm $t3,8,$acc6 + ldbx $acc4($rounds),$acc4 + _srm $t2,24,$acc8 + ldbx $acc5($rounds),$acc5 + _srm $t1,16,$acc9 + _srm $t2,0,$acc7 + ldbx $acc6($rounds),$acc6 + ldbx $acc7($rounds),$acc7 + _srm $t0,8,$acc10 + ldbx $acc8($rounds),$acc8 + _srm $t3,24,$acc12 + ldbx $acc9($rounds),$acc9 + _srm $t2,16,$acc13 + _srm $t3,0,$acc11 + ldbx $acc10($rounds),$acc10 + _srm $t1,8,$acc14 + ldbx $acc11($rounds),$acc11 + ldbx $acc12($rounds),$acc12 + ldbx $acc13($rounds),$acc13 + _srm $t0,0,$acc15 + ldbx $acc14($rounds),$acc14 + + dep $acc0,7,8,$acc3 + ldbx $acc15($rounds),$acc15 + dep $acc4,7,8,$acc7 + dep $acc1,15,8,$acc3 + dep $acc5,15,8,$acc7 + dep $acc2,23,8,$acc3 + dep $acc6,23,8,$acc7 + xor $acc3,$s0,$s0 + xor $acc7,$s1,$s1 + dep $acc8,7,8,$acc11 + dep $acc12,7,8,$acc15 + dep $acc9,15,8,$acc11 + dep $acc13,15,8,$acc15 + dep $acc10,23,8,$acc11 + dep $acc14,23,8,$acc15 + xor $acc11,$s2,$s2 + + bv (%r31) + .EXIT + xor $acc15,$s3,$s3 + .PROCEND + + .ALIGN 64 +L\$AES_Td + .WORD 0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96 + .WORD 0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393 + .WORD 0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25 + .WORD 0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f + .WORD 0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1 + .WORD 0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6 + .WORD 0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da + .WORD 0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844 + .WORD 0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd + .WORD 0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4 + .WORD 0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45 + .WORD 0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94 + .WORD 0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7 + .WORD 0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a + .WORD 0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5 + .WORD 0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c + .WORD 0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1 + .WORD 0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a + .WORD 0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75 + .WORD 0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051 + .WORD 0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46 + .WORD 0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff + .WORD 0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77 + .WORD 0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb + .WORD 0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000 + .WORD 0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e + .WORD 0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927 + .WORD 0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a + .WORD 0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e + .WORD 0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16 + .WORD 0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d + .WORD 0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8 + .WORD 0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd + .WORD 0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34 + .WORD 0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163 + .WORD 0xd731dcca, 0x42638510, 0x13972240, 0x84c61120 + .WORD 0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d + .WORD 0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0 + .WORD 0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422 + .WORD 0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef + .WORD 0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36 + .WORD 0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4 + .WORD 0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662 + .WORD 0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5 + .WORD 0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3 + .WORD 0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b + .WORD 0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8 + .WORD 0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6 + .WORD 0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6 + .WORD 0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0 + .WORD 0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815 + .WORD 0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f + .WORD 0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df + .WORD 0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f + .WORD 0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e + .WORD 0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713 + .WORD 0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89 + .WORD 0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c + .WORD 0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf + .WORD 0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86 + .WORD 0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f + .WORD 0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541 + .WORD 0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190 + .WORD 0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742 + .BYTE 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38 + .BYTE 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb + .BYTE 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87 + .BYTE 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb + .BYTE 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d + .BYTE 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e + .BYTE 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2 + .BYTE 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 + .BYTE 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16 + .BYTE 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 + .BYTE 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda + .BYTE 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 + .BYTE 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a + .BYTE 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 + .BYTE 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02 + .BYTE 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b + .BYTE 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea + .BYTE 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 + .BYTE 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85 + .BYTE 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e + .BYTE 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89 + .BYTE 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b + .BYTE 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20 + .BYTE 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 + .BYTE 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31 + .BYTE 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f + .BYTE 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d + .BYTE 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef + .BYTE 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0 + .BYTE 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 + .BYTE 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26 + .BYTE 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d + .STRINGZ "AES for PA-RISC, CRYPTOGAMS by " +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + # translate made up instructons: _ror, _srm + s/_ror(\s+)(%r[0-9]+),/shd$1$2,$2,/ or + + s/_srm(\s+%r[0-9]+),([0-9]+),/ + $SIZE_T==4 ? sprintf("extru%s,%d,8,",$1,31-$2) + : sprintf("extrd,u%s,%d,8,",$1,63-$2)/e; + + s/,\*/,/ if ($SIZE_T==4); + print $_,"\n"; +} +close STDOUT; diff --git a/src/lib/libcrypto/aes/asm/aes-ppc.pl b/src/lib/libcrypto/aes/asm/aes-ppc.pl index f82c5e1814..7c52cbe5f9 100644 --- a/src/lib/libcrypto/aes/asm/aes-ppc.pl +++ b/src/lib/libcrypto/aes/asm/aes-ppc.pl @@ -7,7 +7,7 @@ # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== -# Needs more work: key setup, page boundaries, CBC routine... +# Needs more work: key setup, CBC routine... # # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with # 128-bit key, which is ~40% better than 64-bit code generated by gcc @@ -18,7 +18,7 @@ # February 2010 # -# Rescheduling instructions to favour Power6 pipeline gives 10% +# Rescheduling instructions to favour Power6 pipeline gave 10% # performance improvement on the platfrom in question (and marginal # improvement even on others). It should be noted that Power6 fails # to process byte in 18 cycles, only in 23, because it fails to issue @@ -33,11 +33,13 @@ $flavour = shift; if ($flavour =~ /64/) { $SIZE_T =8; + $LRSAVE =2*$SIZE_T; $STU ="stdu"; $POP ="ld"; $PUSH ="std"; } elsif ($flavour =~ /32/) { $SIZE_T =4; + $LRSAVE =$SIZE_T; $STU ="stwu"; $POP ="lwz"; $PUSH ="stw"; @@ -116,15 +118,19 @@ LAES_Te: addi $Tbl0,$Tbl0,`128-8` mtlr r0 blr - .space `32-24` + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `64-9*4` LAES_Td: mflr r0 bcl 20,31,\$+4 mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry - addi $Tbl0,$Tbl0,`128-8-32+2048+256` + addi $Tbl0,$Tbl0,`128-64-8+2048+256` mtlr r0 blr - .space `128-32-24` + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `128-64-9*4` ___ &_data_word( 0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d, @@ -328,10 +334,9 @@ $code.=<<___; .globl .AES_encrypt .align 7 .AES_encrypt: - mflr r0 $STU $sp,-$FRAME($sp) + mflr r0 - $PUSH r0,`$FRAME-$SIZE_T*21`($sp) $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) $PUSH r13,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) @@ -352,7 +357,14 @@ $code.=<<___; $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) + + andi. $t0,$inp,3 + andi. $t1,$out,3 + or. $t0,$t0,$t1 + bne Lenc_unaligned +Lenc_unaligned_ok: lwz $s0,0($inp) lwz $s1,4($inp) lwz $s2,8($inp) @@ -363,8 +375,80 @@ $code.=<<___; stw $s1,4($out) stw $s2,8($out) stw $s3,12($out) + b Lenc_done + +Lenc_unaligned: + subfic $t0,$inp,4096 + subfic $t1,$out,4096 + andi. $t0,$t0,4096-16 + beq Lenc_xpage + andi. $t1,$t1,4096-16 + bne Lenc_unaligned_ok + +Lenc_xpage: + lbz $acc00,0($inp) + lbz $acc01,1($inp) + lbz $acc02,2($inp) + lbz $s0,3($inp) + lbz $acc04,4($inp) + lbz $acc05,5($inp) + lbz $acc06,6($inp) + lbz $s1,7($inp) + lbz $acc08,8($inp) + lbz $acc09,9($inp) + lbz $acc10,10($inp) + insrwi $s0,$acc00,8,0 + lbz $s2,11($inp) + insrwi $s1,$acc04,8,0 + lbz $acc12,12($inp) + insrwi $s0,$acc01,8,8 + lbz $acc13,13($inp) + insrwi $s1,$acc05,8,8 + lbz $acc14,14($inp) + insrwi $s0,$acc02,8,16 + lbz $s3,15($inp) + insrwi $s1,$acc06,8,16 + insrwi $s2,$acc08,8,0 + insrwi $s3,$acc12,8,0 + insrwi $s2,$acc09,8,8 + insrwi $s3,$acc13,8,8 + insrwi $s2,$acc10,8,16 + insrwi $s3,$acc14,8,16 + + bl LAES_Te + bl Lppc_AES_encrypt_compact + + extrwi $acc00,$s0,8,0 + extrwi $acc01,$s0,8,8 + stb $acc00,0($out) + extrwi $acc02,$s0,8,16 + stb $acc01,1($out) + stb $acc02,2($out) + extrwi $acc04,$s1,8,0 + stb $s0,3($out) + extrwi $acc05,$s1,8,8 + stb $acc04,4($out) + extrwi $acc06,$s1,8,16 + stb $acc05,5($out) + stb $acc06,6($out) + extrwi $acc08,$s2,8,0 + stb $s1,7($out) + extrwi $acc09,$s2,8,8 + stb $acc08,8($out) + extrwi $acc10,$s2,8,16 + stb $acc09,9($out) + stb $acc10,10($out) + extrwi $acc12,$s3,8,0 + stb $s2,11($out) + extrwi $acc13,$s3,8,8 + stb $acc12,12($out) + extrwi $acc14,$s3,8,16 + stb $acc13,13($out) + stb $acc14,14($out) + stb $s3,15($out) - $POP r0,`$FRAME-$SIZE_T*21`($sp) +Lenc_done: + $POP r0,`$FRAME+$LRSAVE`($sp) $POP $toc,`$FRAME-$SIZE_T*20`($sp) $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) @@ -388,18 +472,21 @@ $code.=<<___; mtlr r0 addi $sp,$sp,$FRAME blr + .long 0 + .byte 0,12,4,1,0x80,18,3,0 + .long 0 .align 5 Lppc_AES_encrypt: lwz $acc00,240($key) - lwz $t0,0($key) - lwz $t1,4($key) - lwz $t2,8($key) - lwz $t3,12($key) addi $Tbl1,$Tbl0,3 + lwz $t0,0($key) addi $Tbl2,$Tbl0,2 + lwz $t1,4($key) addi $Tbl3,$Tbl0,1 + lwz $t2,8($key) addi $acc00,$acc00,-1 + lwz $t3,12($key) addi $key,$key,16 xor $s0,$s0,$t0 xor $s1,$s1,$t1 @@ -413,44 +500,44 @@ Lenc_loop: rlwinm $acc02,$s2,`32-24+3`,21,28 rlwinm $acc03,$s3,`32-24+3`,21,28 lwz $t0,0($key) - lwz $t1,4($key) rlwinm $acc04,$s1,`32-16+3`,21,28 + lwz $t1,4($key) rlwinm $acc05,$s2,`32-16+3`,21,28 lwz $t2,8($key) - lwz $t3,12($key) rlwinm $acc06,$s3,`32-16+3`,21,28 + lwz $t3,12($key) rlwinm $acc07,$s0,`32-16+3`,21,28 lwzx $acc00,$Tbl0,$acc00 - lwzx $acc01,$Tbl0,$acc01 rlwinm $acc08,$s2,`32-8+3`,21,28 + lwzx $acc01,$Tbl0,$acc01 rlwinm $acc09,$s3,`32-8+3`,21,28 lwzx $acc02,$Tbl0,$acc02 - lwzx $acc03,$Tbl0,$acc03 rlwinm $acc10,$s0,`32-8+3`,21,28 + lwzx $acc03,$Tbl0,$acc03 rlwinm $acc11,$s1,`32-8+3`,21,28 lwzx $acc04,$Tbl1,$acc04 - lwzx $acc05,$Tbl1,$acc05 rlwinm $acc12,$s3,`0+3`,21,28 + lwzx $acc05,$Tbl1,$acc05 rlwinm $acc13,$s0,`0+3`,21,28 lwzx $acc06,$Tbl1,$acc06 - lwzx $acc07,$Tbl1,$acc07 rlwinm $acc14,$s1,`0+3`,21,28 + lwzx $acc07,$Tbl1,$acc07 rlwinm $acc15,$s2,`0+3`,21,28 lwzx $acc08,$Tbl2,$acc08 - lwzx $acc09,$Tbl2,$acc09 xor $t0,$t0,$acc00 + lwzx $acc09,$Tbl2,$acc09 xor $t1,$t1,$acc01 lwzx $acc10,$Tbl2,$acc10 - lwzx $acc11,$Tbl2,$acc11 xor $t2,$t2,$acc02 + lwzx $acc11,$Tbl2,$acc11 xor $t3,$t3,$acc03 lwzx $acc12,$Tbl3,$acc12 - lwzx $acc13,$Tbl3,$acc13 xor $t0,$t0,$acc04 + lwzx $acc13,$Tbl3,$acc13 xor $t1,$t1,$acc05 lwzx $acc14,$Tbl3,$acc14 - lwzx $acc15,$Tbl3,$acc15 xor $t2,$t2,$acc06 + lwzx $acc15,$Tbl3,$acc15 xor $t3,$t3,$acc07 xor $t0,$t0,$acc08 xor $t1,$t1,$acc09 @@ -466,60 +553,60 @@ Lenc_loop: addi $Tbl2,$Tbl0,2048 nop lwz $t0,0($key) - lwz $t1,4($key) rlwinm $acc00,$s0,`32-24`,24,31 + lwz $t1,4($key) rlwinm $acc01,$s1,`32-24`,24,31 lwz $t2,8($key) - lwz $t3,12($key) rlwinm $acc02,$s2,`32-24`,24,31 + lwz $t3,12($key) rlwinm $acc03,$s3,`32-24`,24,31 lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4 - lwz $acc09,`2048+32`($Tbl0) rlwinm $acc04,$s1,`32-16`,24,31 + lwz $acc09,`2048+32`($Tbl0) rlwinm $acc05,$s2,`32-16`,24,31 lwz $acc10,`2048+64`($Tbl0) - lwz $acc11,`2048+96`($Tbl0) rlwinm $acc06,$s3,`32-16`,24,31 + lwz $acc11,`2048+96`($Tbl0) rlwinm $acc07,$s0,`32-16`,24,31 lwz $acc12,`2048+128`($Tbl0) - lwz $acc13,`2048+160`($Tbl0) rlwinm $acc08,$s2,`32-8`,24,31 + lwz $acc13,`2048+160`($Tbl0) rlwinm $acc09,$s3,`32-8`,24,31 lwz $acc14,`2048+192`($Tbl0) - lwz $acc15,`2048+224`($Tbl0) rlwinm $acc10,$s0,`32-8`,24,31 + lwz $acc15,`2048+224`($Tbl0) rlwinm $acc11,$s1,`32-8`,24,31 lbzx $acc00,$Tbl2,$acc00 - lbzx $acc01,$Tbl2,$acc01 rlwinm $acc12,$s3,`0`,24,31 + lbzx $acc01,$Tbl2,$acc01 rlwinm $acc13,$s0,`0`,24,31 lbzx $acc02,$Tbl2,$acc02 - lbzx $acc03,$Tbl2,$acc03 rlwinm $acc14,$s1,`0`,24,31 + lbzx $acc03,$Tbl2,$acc03 rlwinm $acc15,$s2,`0`,24,31 lbzx $acc04,$Tbl2,$acc04 - lbzx $acc05,$Tbl2,$acc05 rlwinm $s0,$acc00,24,0,7 + lbzx $acc05,$Tbl2,$acc05 rlwinm $s1,$acc01,24,0,7 lbzx $acc06,$Tbl2,$acc06 - lbzx $acc07,$Tbl2,$acc07 rlwinm $s2,$acc02,24,0,7 + lbzx $acc07,$Tbl2,$acc07 rlwinm $s3,$acc03,24,0,7 lbzx $acc08,$Tbl2,$acc08 - lbzx $acc09,$Tbl2,$acc09 rlwimi $s0,$acc04,16,8,15 + lbzx $acc09,$Tbl2,$acc09 rlwimi $s1,$acc05,16,8,15 lbzx $acc10,$Tbl2,$acc10 - lbzx $acc11,$Tbl2,$acc11 rlwimi $s2,$acc06,16,8,15 + lbzx $acc11,$Tbl2,$acc11 rlwimi $s3,$acc07,16,8,15 lbzx $acc12,$Tbl2,$acc12 - lbzx $acc13,$Tbl2,$acc13 rlwimi $s0,$acc08,8,16,23 + lbzx $acc13,$Tbl2,$acc13 rlwimi $s1,$acc09,8,16,23 lbzx $acc14,$Tbl2,$acc14 - lbzx $acc15,$Tbl2,$acc15 rlwimi $s2,$acc10,8,16,23 + lbzx $acc15,$Tbl2,$acc15 rlwimi $s3,$acc11,8,16,23 or $s0,$s0,$acc12 or $s1,$s1,$acc13 @@ -530,29 +617,31 @@ Lenc_loop: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .align 4 Lppc_AES_encrypt_compact: lwz $acc00,240($key) - lwz $t0,0($key) - lwz $t1,4($key) - lwz $t2,8($key) - lwz $t3,12($key) addi $Tbl1,$Tbl0,2048 + lwz $t0,0($key) lis $mask80,0x8080 + lwz $t1,4($key) lis $mask1b,0x1b1b - addi $key,$key,16 + lwz $t2,8($key) ori $mask80,$mask80,0x8080 + lwz $t3,12($key) ori $mask1b,$mask1b,0x1b1b + addi $key,$key,16 mtctr $acc00 .align 4 Lenc_compact_loop: xor $s0,$s0,$t0 xor $s1,$s1,$t1 - xor $s2,$s2,$t2 - xor $s3,$s3,$t3 rlwinm $acc00,$s0,`32-24`,24,31 + xor $s2,$s2,$t2 rlwinm $acc01,$s1,`32-24`,24,31 + xor $s3,$s3,$t3 rlwinm $acc02,$s2,`32-24`,24,31 rlwinm $acc03,$s3,`32-24`,24,31 rlwinm $acc04,$s1,`32-16`,24,31 @@ -560,48 +649,48 @@ Lenc_compact_loop: rlwinm $acc06,$s3,`32-16`,24,31 rlwinm $acc07,$s0,`32-16`,24,31 lbzx $acc00,$Tbl1,$acc00 - lbzx $acc01,$Tbl1,$acc01 rlwinm $acc08,$s2,`32-8`,24,31 + lbzx $acc01,$Tbl1,$acc01 rlwinm $acc09,$s3,`32-8`,24,31 lbzx $acc02,$Tbl1,$acc02 - lbzx $acc03,$Tbl1,$acc03 rlwinm $acc10,$s0,`32-8`,24,31 + lbzx $acc03,$Tbl1,$acc03 rlwinm $acc11,$s1,`32-8`,24,31 lbzx $acc04,$Tbl1,$acc04 - lbzx $acc05,$Tbl1,$acc05 rlwinm $acc12,$s3,`0`,24,31 + lbzx $acc05,$Tbl1,$acc05 rlwinm $acc13,$s0,`0`,24,31 lbzx $acc06,$Tbl1,$acc06 - lbzx $acc07,$Tbl1,$acc07 rlwinm $acc14,$s1,`0`,24,31 + lbzx $acc07,$Tbl1,$acc07 rlwinm $acc15,$s2,`0`,24,31 lbzx $acc08,$Tbl1,$acc08 - lbzx $acc09,$Tbl1,$acc09 rlwinm $s0,$acc00,24,0,7 + lbzx $acc09,$Tbl1,$acc09 rlwinm $s1,$acc01,24,0,7 lbzx $acc10,$Tbl1,$acc10 - lbzx $acc11,$Tbl1,$acc11 rlwinm $s2,$acc02,24,0,7 + lbzx $acc11,$Tbl1,$acc11 rlwinm $s3,$acc03,24,0,7 lbzx $acc12,$Tbl1,$acc12 - lbzx $acc13,$Tbl1,$acc13 rlwimi $s0,$acc04,16,8,15 + lbzx $acc13,$Tbl1,$acc13 rlwimi $s1,$acc05,16,8,15 lbzx $acc14,$Tbl1,$acc14 - lbzx $acc15,$Tbl1,$acc15 rlwimi $s2,$acc06,16,8,15 + lbzx $acc15,$Tbl1,$acc15 rlwimi $s3,$acc07,16,8,15 rlwimi $s0,$acc08,8,16,23 rlwimi $s1,$acc09,8,16,23 rlwimi $s2,$acc10,8,16,23 rlwimi $s3,$acc11,8,16,23 lwz $t0,0($key) - lwz $t1,4($key) or $s0,$s0,$acc12 + lwz $t1,4($key) or $s1,$s1,$acc13 lwz $t2,8($key) - lwz $t3,12($key) or $s2,$s2,$acc14 + lwz $t3,12($key) or $s3,$s3,$acc15 addi $key,$key,16 @@ -612,12 +701,12 @@ Lenc_compact_loop: and $acc02,$s2,$mask80 and $acc03,$s3,$mask80 srwi $acc04,$acc00,7 # r1>>7 - srwi $acc05,$acc01,7 - srwi $acc06,$acc02,7 - srwi $acc07,$acc03,7 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f + srwi $acc05,$acc01,7 andc $acc09,$s1,$mask80 + srwi $acc06,$acc02,7 andc $acc10,$s2,$mask80 + srwi $acc07,$acc03,7 andc $acc11,$s3,$mask80 sub $acc00,$acc00,$acc04 # r1-(r1>>7) sub $acc01,$acc01,$acc05 @@ -633,32 +722,32 @@ Lenc_compact_loop: and $acc03,$acc03,$mask1b xor $acc00,$acc00,$acc08 # r2 xor $acc01,$acc01,$acc09 + rotlwi $acc12,$s0,16 # ROTATE(r0,16) xor $acc02,$acc02,$acc10 + rotlwi $acc13,$s1,16 xor $acc03,$acc03,$acc11 + rotlwi $acc14,$s2,16 - rotlwi $acc12,$s0,16 # ROTATE(r0,16) - rotlwi $acc13,$s1,16 - rotlwi $acc14,$s2,16 - rotlwi $acc15,$s3,16 xor $s0,$s0,$acc00 # r0^r2 + rotlwi $acc15,$s3,16 xor $s1,$s1,$acc01 - xor $s2,$s2,$acc02 - xor $s3,$s3,$acc03 rotrwi $s0,$s0,24 # ROTATE(r2^r0,24) + xor $s2,$s2,$acc02 rotrwi $s1,$s1,24 + xor $s3,$s3,$acc03 rotrwi $s2,$s2,24 - rotrwi $s3,$s3,24 xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2 + rotrwi $s3,$s3,24 xor $s1,$s1,$acc01 xor $s2,$s2,$acc02 xor $s3,$s3,$acc03 rotlwi $acc08,$acc12,8 # ROTATE(r0,24) - rotlwi $acc09,$acc13,8 - rotlwi $acc10,$acc14,8 - rotlwi $acc11,$acc15,8 xor $s0,$s0,$acc12 # + rotlwi $acc09,$acc13,8 xor $s1,$s1,$acc13 + rotlwi $acc10,$acc14,8 xor $s2,$s2,$acc14 + rotlwi $acc11,$acc15,8 xor $s3,$s3,$acc15 xor $s0,$s0,$acc08 # xor $s1,$s1,$acc09 @@ -673,14 +762,15 @@ Lenc_compact_done: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .globl .AES_decrypt .align 7 .AES_decrypt: - mflr r0 $STU $sp,-$FRAME($sp) + mflr r0 - $PUSH r0,`$FRAME-$SIZE_T*21`($sp) $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) $PUSH r13,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) @@ -701,7 +791,14 @@ Lenc_compact_done: $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) + andi. $t0,$inp,3 + andi. $t1,$out,3 + or. $t0,$t0,$t1 + bne Ldec_unaligned + +Ldec_unaligned_ok: lwz $s0,0($inp) lwz $s1,4($inp) lwz $s2,8($inp) @@ -712,8 +809,80 @@ Lenc_compact_done: stw $s1,4($out) stw $s2,8($out) stw $s3,12($out) + b Ldec_done + +Ldec_unaligned: + subfic $t0,$inp,4096 + subfic $t1,$out,4096 + andi. $t0,$t0,4096-16 + beq Ldec_xpage + andi. $t1,$t1,4096-16 + bne Ldec_unaligned_ok + +Ldec_xpage: + lbz $acc00,0($inp) + lbz $acc01,1($inp) + lbz $acc02,2($inp) + lbz $s0,3($inp) + lbz $acc04,4($inp) + lbz $acc05,5($inp) + lbz $acc06,6($inp) + lbz $s1,7($inp) + lbz $acc08,8($inp) + lbz $acc09,9($inp) + lbz $acc10,10($inp) + insrwi $s0,$acc00,8,0 + lbz $s2,11($inp) + insrwi $s1,$acc04,8,0 + lbz $acc12,12($inp) + insrwi $s0,$acc01,8,8 + lbz $acc13,13($inp) + insrwi $s1,$acc05,8,8 + lbz $acc14,14($inp) + insrwi $s0,$acc02,8,16 + lbz $s3,15($inp) + insrwi $s1,$acc06,8,16 + insrwi $s2,$acc08,8,0 + insrwi $s3,$acc12,8,0 + insrwi $s2,$acc09,8,8 + insrwi $s3,$acc13,8,8 + insrwi $s2,$acc10,8,16 + insrwi $s3,$acc14,8,16 + + bl LAES_Td + bl Lppc_AES_decrypt_compact - $POP r0,`$FRAME-$SIZE_T*21`($sp) + extrwi $acc00,$s0,8,0 + extrwi $acc01,$s0,8,8 + stb $acc00,0($out) + extrwi $acc02,$s0,8,16 + stb $acc01,1($out) + stb $acc02,2($out) + extrwi $acc04,$s1,8,0 + stb $s0,3($out) + extrwi $acc05,$s1,8,8 + stb $acc04,4($out) + extrwi $acc06,$s1,8,16 + stb $acc05,5($out) + stb $acc06,6($out) + extrwi $acc08,$s2,8,0 + stb $s1,7($out) + extrwi $acc09,$s2,8,8 + stb $acc08,8($out) + extrwi $acc10,$s2,8,16 + stb $acc09,9($out) + stb $acc10,10($out) + extrwi $acc12,$s3,8,0 + stb $s2,11($out) + extrwi $acc13,$s3,8,8 + stb $acc12,12($out) + extrwi $acc14,$s3,8,16 + stb $acc13,13($out) + stb $acc14,14($out) + stb $s3,15($out) + +Ldec_done: + $POP r0,`$FRAME+$LRSAVE`($sp) $POP $toc,`$FRAME-$SIZE_T*20`($sp) $POP r13,`$FRAME-$SIZE_T*19`($sp) $POP r14,`$FRAME-$SIZE_T*18`($sp) @@ -737,18 +906,21 @@ Lenc_compact_done: mtlr r0 addi $sp,$sp,$FRAME blr + .long 0 + .byte 0,12,4,1,0x80,18,3,0 + .long 0 .align 5 Lppc_AES_decrypt: lwz $acc00,240($key) - lwz $t0,0($key) - lwz $t1,4($key) - lwz $t2,8($key) - lwz $t3,12($key) addi $Tbl1,$Tbl0,3 + lwz $t0,0($key) addi $Tbl2,$Tbl0,2 + lwz $t1,4($key) addi $Tbl3,$Tbl0,1 + lwz $t2,8($key) addi $acc00,$acc00,-1 + lwz $t3,12($key) addi $key,$key,16 xor $s0,$s0,$t0 xor $s1,$s1,$t1 @@ -762,44 +934,44 @@ Ldec_loop: rlwinm $acc02,$s2,`32-24+3`,21,28 rlwinm $acc03,$s3,`32-24+3`,21,28 lwz $t0,0($key) - lwz $t1,4($key) rlwinm $acc04,$s3,`32-16+3`,21,28 + lwz $t1,4($key) rlwinm $acc05,$s0,`32-16+3`,21,28 lwz $t2,8($key) - lwz $t3,12($key) rlwinm $acc06,$s1,`32-16+3`,21,28 + lwz $t3,12($key) rlwinm $acc07,$s2,`32-16+3`,21,28 lwzx $acc00,$Tbl0,$acc00 - lwzx $acc01,$Tbl0,$acc01 rlwinm $acc08,$s2,`32-8+3`,21,28 + lwzx $acc01,$Tbl0,$acc01 rlwinm $acc09,$s3,`32-8+3`,21,28 lwzx $acc02,$Tbl0,$acc02 - lwzx $acc03,$Tbl0,$acc03 rlwinm $acc10,$s0,`32-8+3`,21,28 + lwzx $acc03,$Tbl0,$acc03 rlwinm $acc11,$s1,`32-8+3`,21,28 lwzx $acc04,$Tbl1,$acc04 - lwzx $acc05,$Tbl1,$acc05 rlwinm $acc12,$s1,`0+3`,21,28 + lwzx $acc05,$Tbl1,$acc05 rlwinm $acc13,$s2,`0+3`,21,28 lwzx $acc06,$Tbl1,$acc06 - lwzx $acc07,$Tbl1,$acc07 rlwinm $acc14,$s3,`0+3`,21,28 + lwzx $acc07,$Tbl1,$acc07 rlwinm $acc15,$s0,`0+3`,21,28 lwzx $acc08,$Tbl2,$acc08 - lwzx $acc09,$Tbl2,$acc09 xor $t0,$t0,$acc00 + lwzx $acc09,$Tbl2,$acc09 xor $t1,$t1,$acc01 lwzx $acc10,$Tbl2,$acc10 - lwzx $acc11,$Tbl2,$acc11 xor $t2,$t2,$acc02 + lwzx $acc11,$Tbl2,$acc11 xor $t3,$t3,$acc03 lwzx $acc12,$Tbl3,$acc12 - lwzx $acc13,$Tbl3,$acc13 xor $t0,$t0,$acc04 + lwzx $acc13,$Tbl3,$acc13 xor $t1,$t1,$acc05 lwzx $acc14,$Tbl3,$acc14 - lwzx $acc15,$Tbl3,$acc15 xor $t2,$t2,$acc06 + lwzx $acc15,$Tbl3,$acc15 xor $t3,$t3,$acc07 xor $t0,$t0,$acc08 xor $t1,$t1,$acc09 @@ -815,56 +987,56 @@ Ldec_loop: addi $Tbl2,$Tbl0,2048 nop lwz $t0,0($key) - lwz $t1,4($key) rlwinm $acc00,$s0,`32-24`,24,31 + lwz $t1,4($key) rlwinm $acc01,$s1,`32-24`,24,31 lwz $t2,8($key) - lwz $t3,12($key) rlwinm $acc02,$s2,`32-24`,24,31 + lwz $t3,12($key) rlwinm $acc03,$s3,`32-24`,24,31 lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4 - lwz $acc09,`2048+32`($Tbl0) rlwinm $acc04,$s3,`32-16`,24,31 + lwz $acc09,`2048+32`($Tbl0) rlwinm $acc05,$s0,`32-16`,24,31 lwz $acc10,`2048+64`($Tbl0) - lwz $acc11,`2048+96`($Tbl0) lbzx $acc00,$Tbl2,$acc00 + lwz $acc11,`2048+96`($Tbl0) lbzx $acc01,$Tbl2,$acc01 lwz $acc12,`2048+128`($Tbl0) - lwz $acc13,`2048+160`($Tbl0) rlwinm $acc06,$s1,`32-16`,24,31 + lwz $acc13,`2048+160`($Tbl0) rlwinm $acc07,$s2,`32-16`,24,31 lwz $acc14,`2048+192`($Tbl0) - lwz $acc15,`2048+224`($Tbl0) rlwinm $acc08,$s2,`32-8`,24,31 + lwz $acc15,`2048+224`($Tbl0) rlwinm $acc09,$s3,`32-8`,24,31 lbzx $acc02,$Tbl2,$acc02 - lbzx $acc03,$Tbl2,$acc03 rlwinm $acc10,$s0,`32-8`,24,31 + lbzx $acc03,$Tbl2,$acc03 rlwinm $acc11,$s1,`32-8`,24,31 lbzx $acc04,$Tbl2,$acc04 - lbzx $acc05,$Tbl2,$acc05 rlwinm $acc12,$s1,`0`,24,31 + lbzx $acc05,$Tbl2,$acc05 rlwinm $acc13,$s2,`0`,24,31 lbzx $acc06,$Tbl2,$acc06 - lbzx $acc07,$Tbl2,$acc07 rlwinm $acc14,$s3,`0`,24,31 + lbzx $acc07,$Tbl2,$acc07 rlwinm $acc15,$s0,`0`,24,31 lbzx $acc08,$Tbl2,$acc08 - lbzx $acc09,$Tbl2,$acc09 rlwinm $s0,$acc00,24,0,7 + lbzx $acc09,$Tbl2,$acc09 rlwinm $s1,$acc01,24,0,7 lbzx $acc10,$Tbl2,$acc10 - lbzx $acc11,$Tbl2,$acc11 rlwinm $s2,$acc02,24,0,7 + lbzx $acc11,$Tbl2,$acc11 rlwinm $s3,$acc03,24,0,7 lbzx $acc12,$Tbl2,$acc12 - lbzx $acc13,$Tbl2,$acc13 rlwimi $s0,$acc04,16,8,15 + lbzx $acc13,$Tbl2,$acc13 rlwimi $s1,$acc05,16,8,15 lbzx $acc14,$Tbl2,$acc14 - lbzx $acc15,$Tbl2,$acc15 rlwimi $s2,$acc06,16,8,15 + lbzx $acc15,$Tbl2,$acc15 rlwimi $s3,$acc07,16,8,15 rlwimi $s0,$acc08,8,16,23 rlwimi $s1,$acc09,8,16,23 @@ -879,20 +1051,22 @@ Ldec_loop: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .align 4 Lppc_AES_decrypt_compact: lwz $acc00,240($key) - lwz $t0,0($key) - lwz $t1,4($key) - lwz $t2,8($key) - lwz $t3,12($key) addi $Tbl1,$Tbl0,2048 + lwz $t0,0($key) lis $mask80,0x8080 + lwz $t1,4($key) lis $mask1b,0x1b1b - addi $key,$key,16 + lwz $t2,8($key) ori $mask80,$mask80,0x8080 + lwz $t3,12($key) ori $mask1b,$mask1b,0x1b1b + addi $key,$key,16 ___ $code.=<<___ if ($SIZE_T==8); insrdi $mask80,$mask80,32,0 @@ -904,10 +1078,10 @@ $code.=<<___; Ldec_compact_loop: xor $s0,$s0,$t0 xor $s1,$s1,$t1 - xor $s2,$s2,$t2 - xor $s3,$s3,$t3 rlwinm $acc00,$s0,`32-24`,24,31 + xor $s2,$s2,$t2 rlwinm $acc01,$s1,`32-24`,24,31 + xor $s3,$s3,$t3 rlwinm $acc02,$s2,`32-24`,24,31 rlwinm $acc03,$s3,`32-24`,24,31 rlwinm $acc04,$s3,`32-16`,24,31 @@ -915,48 +1089,48 @@ Ldec_compact_loop: rlwinm $acc06,$s1,`32-16`,24,31 rlwinm $acc07,$s2,`32-16`,24,31 lbzx $acc00,$Tbl1,$acc00 - lbzx $acc01,$Tbl1,$acc01 rlwinm $acc08,$s2,`32-8`,24,31 + lbzx $acc01,$Tbl1,$acc01 rlwinm $acc09,$s3,`32-8`,24,31 lbzx $acc02,$Tbl1,$acc02 - lbzx $acc03,$Tbl1,$acc03 rlwinm $acc10,$s0,`32-8`,24,31 + lbzx $acc03,$Tbl1,$acc03 rlwinm $acc11,$s1,`32-8`,24,31 lbzx $acc04,$Tbl1,$acc04 - lbzx $acc05,$Tbl1,$acc05 rlwinm $acc12,$s1,`0`,24,31 + lbzx $acc05,$Tbl1,$acc05 rlwinm $acc13,$s2,`0`,24,31 lbzx $acc06,$Tbl1,$acc06 - lbzx $acc07,$Tbl1,$acc07 rlwinm $acc14,$s3,`0`,24,31 + lbzx $acc07,$Tbl1,$acc07 rlwinm $acc15,$s0,`0`,24,31 lbzx $acc08,$Tbl1,$acc08 - lbzx $acc09,$Tbl1,$acc09 rlwinm $s0,$acc00,24,0,7 + lbzx $acc09,$Tbl1,$acc09 rlwinm $s1,$acc01,24,0,7 lbzx $acc10,$Tbl1,$acc10 - lbzx $acc11,$Tbl1,$acc11 rlwinm $s2,$acc02,24,0,7 + lbzx $acc11,$Tbl1,$acc11 rlwinm $s3,$acc03,24,0,7 lbzx $acc12,$Tbl1,$acc12 - lbzx $acc13,$Tbl1,$acc13 rlwimi $s0,$acc04,16,8,15 + lbzx $acc13,$Tbl1,$acc13 rlwimi $s1,$acc05,16,8,15 lbzx $acc14,$Tbl1,$acc14 - lbzx $acc15,$Tbl1,$acc15 rlwimi $s2,$acc06,16,8,15 + lbzx $acc15,$Tbl1,$acc15 rlwimi $s3,$acc07,16,8,15 rlwimi $s0,$acc08,8,16,23 rlwimi $s1,$acc09,8,16,23 rlwimi $s2,$acc10,8,16,23 rlwimi $s3,$acc11,8,16,23 lwz $t0,0($key) - lwz $t1,4($key) or $s0,$s0,$acc12 + lwz $t1,4($key) or $s1,$s1,$acc13 lwz $t2,8($key) - lwz $t3,12($key) or $s2,$s2,$acc14 + lwz $t3,12($key) or $s3,$s3,$acc15 addi $key,$key,16 @@ -1030,12 +1204,12 @@ $code.=<<___ if ($SIZE_T==4); and $acc02,$s2,$mask80 and $acc03,$s3,$mask80 srwi $acc04,$acc00,7 # r1>>7 - srwi $acc05,$acc01,7 - srwi $acc06,$acc02,7 - srwi $acc07,$acc03,7 andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f + srwi $acc05,$acc01,7 andc $acc09,$s1,$mask80 + srwi $acc06,$acc02,7 andc $acc10,$s2,$mask80 + srwi $acc07,$acc03,7 andc $acc11,$s3,$mask80 sub $acc00,$acc00,$acc04 # r1-(r1>>7) sub $acc01,$acc01,$acc05 @@ -1059,12 +1233,12 @@ $code.=<<___ if ($SIZE_T==4); and $acc06,$acc02,$mask80 and $acc07,$acc03,$mask80 srwi $acc08,$acc04,7 # r1>>7 - srwi $acc09,$acc05,7 - srwi $acc10,$acc06,7 - srwi $acc11,$acc07,7 andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f + srwi $acc09,$acc05,7 andc $acc13,$acc01,$mask80 + srwi $acc10,$acc06,7 andc $acc14,$acc02,$mask80 + srwi $acc11,$acc07,7 andc $acc15,$acc03,$mask80 sub $acc04,$acc04,$acc08 # r1-(r1>>7) sub $acc05,$acc05,$acc09 @@ -1085,13 +1259,13 @@ $code.=<<___ if ($SIZE_T==4); and $acc08,$acc04,$mask80 # r1=r4&0x80808080 and $acc09,$acc05,$mask80 - and $acc10,$acc06,$mask80 - and $acc11,$acc07,$mask80 srwi $acc12,$acc08,7 # r1>>7 + and $acc10,$acc06,$mask80 srwi $acc13,$acc09,7 + and $acc11,$acc07,$mask80 srwi $acc14,$acc10,7 - srwi $acc15,$acc11,7 sub $acc08,$acc08,$acc12 # r1-(r1>>7) + srwi $acc15,$acc11,7 sub $acc09,$acc09,$acc13 sub $acc10,$acc10,$acc14 sub $acc11,$acc11,$acc15 @@ -1124,10 +1298,10 @@ ___ $code.=<<___; rotrwi $s0,$s0,8 # = ROTATE(r0,8) rotrwi $s1,$s1,8 - rotrwi $s2,$s2,8 - rotrwi $s3,$s3,8 xor $s0,$s0,$acc00 # ^= r2^r0 + rotrwi $s2,$s2,8 xor $s1,$s1,$acc01 + rotrwi $s3,$s3,8 xor $s2,$s2,$acc02 xor $s3,$s3,$acc03 xor $acc00,$acc00,$acc08 @@ -1135,32 +1309,32 @@ $code.=<<___; xor $acc02,$acc02,$acc10 xor $acc03,$acc03,$acc11 xor $s0,$s0,$acc04 # ^= r4^r0 - xor $s1,$s1,$acc05 - xor $s2,$s2,$acc06 - xor $s3,$s3,$acc07 rotrwi $acc00,$acc00,24 + xor $s1,$s1,$acc05 rotrwi $acc01,$acc01,24 + xor $s2,$s2,$acc06 rotrwi $acc02,$acc02,24 + xor $s3,$s3,$acc07 rotrwi $acc03,$acc03,24 xor $acc04,$acc04,$acc08 xor $acc05,$acc05,$acc09 xor $acc06,$acc06,$acc10 xor $acc07,$acc07,$acc11 xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)] - xor $s1,$s1,$acc09 - xor $s2,$s2,$acc10 - xor $s3,$s3,$acc11 rotrwi $acc04,$acc04,16 + xor $s1,$s1,$acc09 rotrwi $acc05,$acc05,16 + xor $s2,$s2,$acc10 rotrwi $acc06,$acc06,16 + xor $s3,$s3,$acc11 rotrwi $acc07,$acc07,16 xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24) - xor $s1,$s1,$acc01 - xor $s2,$s2,$acc02 - xor $s3,$s3,$acc03 rotrwi $acc08,$acc08,8 + xor $s1,$s1,$acc01 rotrwi $acc09,$acc09,8 + xor $s2,$s2,$acc02 rotrwi $acc10,$acc10,8 + xor $s3,$s3,$acc03 rotrwi $acc11,$acc11,8 xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16) xor $s1,$s1,$acc05 @@ -1179,7 +1353,9 @@ Ldec_compact_done: xor $s2,$s2,$t2 xor $s3,$s3,$t3 blr -.long 0 + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .asciz "AES for PPC, CRYPTOGAMS by " .align 7 ___ diff --git a/src/lib/libcrypto/aes/asm/aes-s390x.pl b/src/lib/libcrypto/aes/asm/aes-s390x.pl index 7e01889298..445a1e6762 100644 --- a/src/lib/libcrypto/aes/asm/aes-s390x.pl +++ b/src/lib/libcrypto/aes/asm/aes-s390x.pl @@ -44,12 +44,57 @@ # Unlike previous version hardware support detection takes place only # at the moment of key schedule setup, which is denoted in key->rounds. # This is done, because deferred key setup can't be made MT-safe, not -# for key lengthes longer than 128 bits. +# for keys longer than 128 bits. # # Add AES_cbc_encrypt, which gives incredible performance improvement, # it was measured to be ~6.6x. It's less than previously mentioned 8x, # because software implementation was optimized. +# May 2010. +# +# Add AES_ctr32_encrypt. If hardware-assisted, it provides up to 4.3x +# performance improvement over "generic" counter mode routine relying +# on single-block, also hardware-assisted, AES_encrypt. "Up to" refers +# to the fact that exact throughput value depends on current stack +# frame alignment within 4KB page. In worst case you get ~75% of the +# maximum, but *on average* it would be as much as ~98%. Meaning that +# worst case is unlike, it's like hitting ravine on plateau. + +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z990 it was measured to perform +# 2x better than code generated by gcc 4.3. + +# December 2010. +# +# Add support for z196 "cipher message with counter" instruction. +# Note however that it's disengaged, because it was measured to +# perform ~12% worse than vanilla km-based code... + +# February 2011. +# +# Add AES_xts_[en|de]crypt. This includes support for z196 km-xts-aes +# instructions, which deliver ~70% improvement at 8KB block size over +# vanilla km-based code, 37% - at most like 512-bytes block size. + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + $softonly=0; # allow hardware support $t0="%r0"; $mask="%r0"; @@ -69,6 +114,8 @@ $rounds="%r13"; $ra="%r14"; $sp="%r15"; +$stdframe=16*$SIZE_T+4*8; + sub _data_word() { my $i; while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } @@ -210,7 +257,7 @@ $code.=<<___ if (!$softonly); .Lesoft: ___ $code.=<<___; - stmg %r3,$ra,24($sp) + stm${g} %r3,$ra,3*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) @@ -220,20 +267,20 @@ $code.=<<___; larl $tbl,AES_Te bras $ra,_s390x_AES_encrypt - lg $out,24($sp) + l${g} $out,3*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) - lmg %r6,$ra,48($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_encrypt,.-AES_encrypt .type _s390x_AES_encrypt,\@function .align 16 _s390x_AES_encrypt: - stg $ra,152($sp) + st${g} $ra,15*$SIZE_T($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) @@ -397,7 +444,7 @@ _s390x_AES_encrypt: or $s2,$i3 or $s3,$t3 - lg $ra,152($sp) + l${g} $ra,15*$SIZE_T($sp) xr $s0,$t0 xr $s1,$t2 x $s2,24($key) @@ -536,7 +583,7 @@ $code.=<<___ if (!$softonly); .Ldsoft: ___ $code.=<<___; - stmg %r3,$ra,24($sp) + stm${g} %r3,$ra,3*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) @@ -546,20 +593,20 @@ $code.=<<___; larl $tbl,AES_Td bras $ra,_s390x_AES_decrypt - lg $out,24($sp) + l${g} $out,3*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) st $s3,12($out) - lmg %r6,$ra,48($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) br $ra .size AES_decrypt,.-AES_decrypt .type _s390x_AES_decrypt,\@function .align 16 _s390x_AES_decrypt: - stg $ra,152($sp) + st${g} $ra,15*$SIZE_T($sp) x $s0,0($key) x $s1,4($key) x $s2,8($key) @@ -703,7 +750,7 @@ _s390x_AES_decrypt: nr $i1,$mask nr $i2,$mask - lg $ra,152($sp) + l${g} $ra,15*$SIZE_T($sp) or $s1,$t1 l $t0,16($key) l $t1,20($key) @@ -732,14 +779,15 @@ ___ $code.=<<___; # void AES_set_encrypt_key(const unsigned char *in, int bits, # AES_KEY *key) { -.globl AES_set_encrypt_key -.type AES_set_encrypt_key,\@function +.globl private_AES_set_encrypt_key +.type private_AES_set_encrypt_key,\@function .align 16 -AES_set_encrypt_key: +private_AES_set_encrypt_key: +_s390x_AES_set_encrypt_key: lghi $t0,0 - clgr $inp,$t0 + cl${g}r $inp,$t0 je .Lminus1 - clgr $key,$t0 + cl${g}r $key,$t0 je .Lminus1 lghi $t0,128 @@ -789,7 +837,8 @@ $code.=<<___ if (!$softonly); je 1f lg %r1,24($inp) stg %r1,24($key) -1: st $bits,236($key) # save bits +1: st $bits,236($key) # save bits [for debugging purposes] + lgr $t0,%r5 st %r5,240($key) # save km code lghi %r2,0 br %r14 @@ -797,7 +846,7 @@ ___ $code.=<<___; .align 16 .Lekey_internal: - stmg %r6,%r13,48($sp) # all non-volatile regs + stm${g} %r4,%r13,4*$SIZE_T($sp) # all non-volatile regs and $key larl $tbl,AES_Te+2048 @@ -857,8 +906,9 @@ $code.=<<___; la $key,16($key) # key+=4 la $t3,4($t3) # i++ brct $rounds,.L128_loop + lghi $t0,10 lghi %r2,0 - lmg %r6,%r13,48($sp) + lm${g} %r4,%r13,4*$SIZE_T($sp) br $ra .align 16 @@ -905,8 +955,9 @@ $code.=<<___; st $s2,32($key) st $s3,36($key) brct $rounds,.L192_continue + lghi $t0,12 lghi %r2,0 - lmg %r6,%r13,48($sp) + lm${g} %r4,%r13,4*$SIZE_T($sp) br $ra .align 16 @@ -967,8 +1018,9 @@ $code.=<<___; st $s2,40($key) st $s3,44($key) brct $rounds,.L256_continue + lghi $t0,14 lghi %r2,0 - lmg %r6,%r13,48($sp) + lm${g} %r4,%r13,4*$SIZE_T($sp) br $ra .align 16 @@ -1011,42 +1063,34 @@ $code.=<<___; .Lminus1: lghi %r2,-1 br $ra -.size AES_set_encrypt_key,.-AES_set_encrypt_key +.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key # void AES_set_decrypt_key(const unsigned char *in, int bits, # AES_KEY *key) { -.globl AES_set_decrypt_key -.type AES_set_decrypt_key,\@function +.globl private_AES_set_decrypt_key +.type private_AES_set_decrypt_key,\@function .align 16 -AES_set_decrypt_key: - stg $key,32($sp) # I rely on AES_set_encrypt_key to - stg $ra,112($sp) # save non-volatile registers! - bras $ra,AES_set_encrypt_key - lg $key,32($sp) - lg $ra,112($sp) +private_AES_set_decrypt_key: + #st${g} $key,4*$SIZE_T($sp) # I rely on AES_set_encrypt_key to + st${g} $ra,14*$SIZE_T($sp) # save non-volatile registers and $key! + bras $ra,_s390x_AES_set_encrypt_key + #l${g} $key,4*$SIZE_T($sp) + l${g} $ra,14*$SIZE_T($sp) ltgr %r2,%r2 bnzr $ra ___ $code.=<<___ if (!$softonly); - l $t0,240($key) + #l $t0,240($key) lhi $t1,16 cr $t0,$t1 jl .Lgo oill $t0,0x80 # set "decrypt" bit st $t0,240($key) br $ra - -.align 16 -.Ldkey_internal: - stg $key,32($sp) - stg $ra,40($sp) - bras $ra,.Lekey_internal - lg $key,32($sp) - lg $ra,40($sp) ___ $code.=<<___; - -.Lgo: llgf $rounds,240($key) +.align 16 +.Lgo: lgr $rounds,$t0 #llgf $rounds,240($key) la $i1,0($key) sllg $i2,$rounds,4 la $i2,0($i2,$key) @@ -1123,13 +1167,14 @@ $code.=<<___; la $key,4($key) brct $rounds,.Lmix - lmg %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key! + lm${g} %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key! lghi %r2,0 br $ra -.size AES_set_decrypt_key,.-AES_set_decrypt_key +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key ___ -#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, +######################################################################## +# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out, # size_t length, const AES_KEY *key, # unsigned char *ivec, const int enc) { @@ -1163,7 +1208,7 @@ $code.=<<___ if (!$softonly); l %r0,240($key) # load kmc code lghi $key,15 # res=len%16, len-=res; ngr $key,$len - slgr $len,$key + sl${g}r $len,$key la %r1,16($sp) # parameter block - ivec || key jz .Lkmc_truncated .long 0xb92f0042 # kmc %r4,%r2 @@ -1181,34 +1226,34 @@ $code.=<<___ if (!$softonly); tmll %r0,0x80 jnz .Lkmc_truncated_dec lghi %r1,0 - stg %r1,128($sp) - stg %r1,136($sp) + stg %r1,16*$SIZE_T($sp) + stg %r1,16*$SIZE_T+8($sp) bras %r1,1f - mvc 128(1,$sp),0($inp) + mvc 16*$SIZE_T(1,$sp),0($inp) 1: ex $key,0(%r1) la %r1,16($sp) # restore parameter block - la $inp,128($sp) + la $inp,16*$SIZE_T($sp) lghi $len,16 .long 0xb92f0042 # kmc %r4,%r2 j .Lkmc_done .align 16 .Lkmc_truncated_dec: - stg $out,64($sp) - la $out,128($sp) + st${g} $out,4*$SIZE_T($sp) + la $out,16*$SIZE_T($sp) lghi $len,16 .long 0xb92f0042 # kmc %r4,%r2 - lg $out,64($sp) + l${g} $out,4*$SIZE_T($sp) bras %r1,2f - mvc 0(1,$out),128($sp) + mvc 0(1,$out),16*$SIZE_T($sp) 2: ex $key,0(%r1) j .Lkmc_done .align 16 .Lcbc_software: ___ $code.=<<___; - stmg $key,$ra,40($sp) + stm${g} $key,$ra,5*$SIZE_T($sp) lhi %r0,0 - cl %r0,164($sp) + cl %r0,`$stdframe+$SIZE_T-4`($sp) je .Lcbc_decrypt larl $tbl,AES_Te @@ -1219,10 +1264,10 @@ $code.=<<___; llgf $s3,12($ivp) lghi $t0,16 - slgr $len,$t0 + sl${g}r $len,$t0 brc 4,.Lcbc_enc_tail # if borrow .Lcbc_enc_loop: - stmg $inp,$out,16($sp) + stm${g} $inp,$out,2*$SIZE_T($sp) x $s0,0($inp) x $s1,4($inp) x $s2,8($inp) @@ -1231,7 +1276,7 @@ $code.=<<___; bras $ra,_s390x_AES_encrypt - lmg $inp,$key,16($sp) + lm${g} $inp,$key,2*$SIZE_T($sp) st $s0,0($out) st $s1,4($out) st $s2,8($out) @@ -1240,33 +1285,33 @@ $code.=<<___; la $inp,16($inp) la $out,16($out) lghi $t0,16 - ltgr $len,$len + lt${g}r $len,$len jz .Lcbc_enc_done - slgr $len,$t0 + sl${g}r $len,$t0 brc 4,.Lcbc_enc_tail # if borrow j .Lcbc_enc_loop .align 16 .Lcbc_enc_done: - lg $ivp,48($sp) + l${g} $ivp,6*$SIZE_T($sp) st $s0,0($ivp) st $s1,4($ivp) st $s2,8($ivp) st $s3,12($ivp) - lmg %r7,$ra,56($sp) + lm${g} %r7,$ra,7*$SIZE_T($sp) br $ra .align 16 .Lcbc_enc_tail: aghi $len,15 lghi $t0,0 - stg $t0,128($sp) - stg $t0,136($sp) + stg $t0,16*$SIZE_T($sp) + stg $t0,16*$SIZE_T+8($sp) bras $t1,3f - mvc 128(1,$sp),0($inp) + mvc 16*$SIZE_T(1,$sp),0($inp) 3: ex $len,0($t1) lghi $len,0 - la $inp,128($sp) + la $inp,16*$SIZE_T($sp) j .Lcbc_enc_loop .align 16 @@ -1275,10 +1320,10 @@ $code.=<<___; lg $t0,0($ivp) lg $t1,8($ivp) - stmg $t0,$t1,128($sp) + stmg $t0,$t1,16*$SIZE_T($sp) .Lcbc_dec_loop: - stmg $inp,$out,16($sp) + stm${g} $inp,$out,2*$SIZE_T($sp) llgf $s0,0($inp) llgf $s1,4($inp) llgf $s2,8($inp) @@ -1287,7 +1332,7 @@ $code.=<<___; bras $ra,_s390x_AES_decrypt - lmg $inp,$key,16($sp) + lm${g} $inp,$key,2*$SIZE_T($sp) sllg $s0,$s0,32 sllg $s2,$s2,32 lr $s0,$s1 @@ -1295,15 +1340,15 @@ $code.=<<___; lg $t0,0($inp) lg $t1,8($inp) - xg $s0,128($sp) - xg $s2,136($sp) + xg $s0,16*$SIZE_T($sp) + xg $s2,16*$SIZE_T+8($sp) lghi $s1,16 - slgr $len,$s1 + sl${g}r $len,$s1 brc 4,.Lcbc_dec_tail # if borrow brc 2,.Lcbc_dec_done # if zero stg $s0,0($out) stg $s2,8($out) - stmg $t0,$t1,128($sp) + stmg $t0,$t1,16*$SIZE_T($sp) la $inp,16($inp) la $out,16($out) @@ -1313,7 +1358,7 @@ $code.=<<___; stg $s0,0($out) stg $s2,8($out) .Lcbc_dec_exit: - lmg $ivp,$ra,48($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) stmg $t0,$t1,0($ivp) br $ra @@ -1321,19 +1366,889 @@ $code.=<<___; .align 16 .Lcbc_dec_tail: aghi $len,15 - stg $s0,128($sp) - stg $s2,136($sp) + stg $s0,16*$SIZE_T($sp) + stg $s2,16*$SIZE_T+8($sp) bras $s1,4f - mvc 0(1,$out),128($sp) + mvc 0(1,$out),16*$SIZE_T($sp) 4: ex $len,0($s1) j .Lcbc_dec_exit .size AES_cbc_encrypt,.-AES_cbc_encrypt -.comm OPENSSL_s390xcap_P,8,8 +___ +} +######################################################################## +# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out, +# size_t blocks, const AES_KEY *key, +# const unsigned char *ivec) +{ +my $inp="%r2"; +my $out="%r4"; # blocks and out are swapped +my $len="%r3"; +my $key="%r5"; my $iv0="%r5"; +my $ivp="%r6"; +my $fp ="%r7"; + +$code.=<<___; +.globl AES_ctr32_encrypt +.type AES_ctr32_encrypt,\@function +.align 16 +AES_ctr32_encrypt: + xgr %r3,%r4 # flip %r3 and %r4, $out and $len + xgr %r4,%r3 + xgr %r3,%r4 + llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case +___ +$code.=<<___ if (!$softonly); + l %r0,240($key) + lhi %r1,16 + clr %r0,%r1 + jl .Lctr32_software + + stm${g} %r6,$s3,6*$SIZE_T($sp) + + slgr $out,$inp + la %r1,0($key) # %r1 is permanent copy of $key + lg $iv0,0($ivp) # load ivec + lg $ivp,8($ivp) + + # prepare and allocate stack frame at the top of 4K page + # with 1K reserved for eventual signal handling + lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer + lghi $s1,-4096 + algr $s0,$sp + lgr $fp,$sp + ngr $s0,$s1 # align at page boundary + slgr $fp,$s0 # total buffer size + lgr $s2,$sp + lghi $s1,1024+16 # sl[g]fi is extended-immediate facility + slgr $fp,$s1 # deduct reservation to get usable buffer size + # buffer size is at lest 256 and at most 3072+256-16 + + la $sp,1024($s0) # alloca + srlg $fp,$fp,4 # convert bytes to blocks, minimum 16 + st${g} $s2,0($sp) # back-chain + st${g} $fp,$SIZE_T($sp) + + slgr $len,$fp + brc 1,.Lctr32_hw_switch # not zero, no borrow + algr $fp,$len # input is shorter than allocated buffer + lghi $len,0 + st${g} $fp,$SIZE_T($sp) + +.Lctr32_hw_switch: +___ +$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower + larl $s0,OPENSSL_s390xcap_P + lg $s0,8($s0) + tmhh $s0,0x0004 # check for message_security-assist-4 + jz .Lctr32_km_loop + + llgfr $s0,%r0 + lgr $s1,%r1 + lghi %r0,0 + la %r1,16($sp) + .long 0xb92d2042 # kmctr %r4,%r2,%r2 + + llihh %r0,0x8000 # check if kmctr supports the function code + srlg %r0,%r0,0($s0) + ng %r0,16($sp) + lgr %r0,$s0 + lgr %r1,$s1 + jz .Lctr32_km_loop + +####### kmctr code + algr $out,$inp # restore $out + lgr $s1,$len # $s1 undertakes $len + j .Lctr32_kmctr_loop +.align 16 +.Lctr32_kmctr_loop: + la $s2,16($sp) + lgr $s3,$fp +.Lctr32_kmctr_prepare: + stg $iv0,0($s2) + stg $ivp,8($s2) + la $s2,16($s2) + ahi $ivp,1 # 32-bit increment, preserves upper half + brct $s3,.Lctr32_kmctr_prepare + + #la $inp,0($inp) # inp + sllg $len,$fp,4 # len + #la $out,0($out) # out + la $s2,16($sp) # iv + .long 0xb92da042 # kmctr $out,$s2,$inp + brc 1,.-4 # pay attention to "partial completion" + + slgr $s1,$fp + brc 1,.Lctr32_kmctr_loop # not zero, no borrow + algr $fp,$s1 + lghi $s1,0 + brc 4+1,.Lctr32_kmctr_loop # not zero + + l${g} $sp,0($sp) + lm${g} %r6,$s3,6*$SIZE_T($sp) + br $ra +.align 16 +___ +$code.=<<___; +.Lctr32_km_loop: + la $s2,16($sp) + lgr $s3,$fp +.Lctr32_km_prepare: + stg $iv0,0($s2) + stg $ivp,8($s2) + la $s2,16($s2) + ahi $ivp,1 # 32-bit increment, preserves upper half + brct $s3,.Lctr32_km_prepare + + la $s0,16($sp) # inp + sllg $s1,$fp,4 # len + la $s2,16($sp) # out + .long 0xb92e00a8 # km %r10,%r8 + brc 1,.-4 # pay attention to "partial completion" + + la $s2,16($sp) + lgr $s3,$fp + slgr $s2,$inp +.Lctr32_km_xor: + lg $s0,0($inp) + lg $s1,8($inp) + xg $s0,0($s2,$inp) + xg $s1,8($s2,$inp) + stg $s0,0($out,$inp) + stg $s1,8($out,$inp) + la $inp,16($inp) + brct $s3,.Lctr32_km_xor + + slgr $len,$fp + brc 1,.Lctr32_km_loop # not zero, no borrow + algr $fp,$len + lghi $len,0 + brc 4+1,.Lctr32_km_loop # not zero + + l${g} $s0,0($sp) + l${g} $s1,$SIZE_T($sp) + la $s2,16($sp) +.Lctr32_km_zap: + stg $s0,0($s2) + stg $s0,8($s2) + la $s2,16($s2) + brct $s1,.Lctr32_km_zap + + la $sp,0($s0) + lm${g} %r6,$s3,6*$SIZE_T($sp) + br $ra +.align 16 +.Lctr32_software: +___ +$code.=<<___; + stm${g} $key,$ra,5*$SIZE_T($sp) + sl${g}r $inp,$out + larl $tbl,AES_Te + llgf $t1,12($ivp) + +.Lctr32_loop: + stm${g} $inp,$out,2*$SIZE_T($sp) + llgf $s0,0($ivp) + llgf $s1,4($ivp) + llgf $s2,8($ivp) + lgr $s3,$t1 + st $t1,16*$SIZE_T($sp) + lgr %r4,$key + + bras $ra,_s390x_AES_encrypt + + lm${g} $inp,$ivp,2*$SIZE_T($sp) + llgf $t1,16*$SIZE_T($sp) + x $s0,0($inp,$out) + x $s1,4($inp,$out) + x $s2,8($inp,$out) + x $s3,12($inp,$out) + stm $s0,$s3,0($out) + + la $out,16($out) + ahi $t1,1 # 32-bit increment + brct $len,.Lctr32_loop + + lm${g} %r6,$ra,6*$SIZE_T($sp) + br $ra +.size AES_ctr32_encrypt,.-AES_ctr32_encrypt +___ +} + +######################################################################## +# void AES_xts_encrypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2, +# const unsigned char iv[16]); +# +{ +my $inp="%r2"; +my $out="%r4"; # len and out are swapped +my $len="%r3"; +my $key1="%r5"; # $i1 +my $key2="%r6"; # $i2 +my $fp="%r7"; # $i3 +my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame... + +$code.=<<___; +.type _s390x_xts_km,\@function +.align 16 +_s390x_xts_km: +___ +$code.=<<___ if(1); + llgfr $s0,%r0 # put aside the function code + lghi $s1,0x7f + nr $s1,%r0 + lghi %r0,0 # query capability vector + la %r1,2*$SIZE_T($sp) + .long 0xb92e0042 # km %r4,%r2 + llihh %r1,0x8000 + srlg %r1,%r1,32($s1) # check for 32+function code + ng %r1,2*$SIZE_T($sp) + lgr %r0,$s0 # restore the function code + la %r1,0($key1) # restore $key1 + jz .Lxts_km_vanilla + + lmg $i2,$i3,$tweak($sp) # put aside the tweak value + algr $out,$inp + + oill %r0,32 # switch to xts function code + aghi $s1,-18 # + sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16 + la %r1,$tweak-16($sp) + slgr %r1,$s1 # parameter block position + lmg $s0,$s3,0($key1) # load 256 bits of key material, + stmg $s0,$s3,0(%r1) # and copy it to parameter block. + # yes, it contains junk and overlaps + # with the tweak in 128-bit case. + # it's done to avoid conditional + # branch. + stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value + + .long 0xb92e0042 # km %r4,%r2 + brc 1,.-4 # pay attention to "partial completion" + + lrvg $s0,$tweak+0($sp) # load the last tweak + lrvg $s1,$tweak+8($sp) + stmg %r0,%r3,$tweak-32(%r1) # wipe copy of the key + + nill %r0,0xffdf # switch back to original function code + la %r1,0($key1) # restore pointer to $key1 + slgr $out,$inp + + llgc $len,2*$SIZE_T-1($sp) + nill $len,0x0f # $len%=16 + br $ra + +.align 16 +.Lxts_km_vanilla: +___ +$code.=<<___; + # prepare and allocate stack frame at the top of 4K page + # with 1K reserved for eventual signal handling + lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer + lghi $s1,-4096 + algr $s0,$sp + lgr $fp,$sp + ngr $s0,$s1 # align at page boundary + slgr $fp,$s0 # total buffer size + lgr $s2,$sp + lghi $s1,1024+16 # sl[g]fi is extended-immediate facility + slgr $fp,$s1 # deduct reservation to get usable buffer size + # buffer size is at lest 256 and at most 3072+256-16 + + la $sp,1024($s0) # alloca + nill $fp,0xfff0 # round to 16*n + st${g} $s2,0($sp) # back-chain + nill $len,0xfff0 # redundant + st${g} $fp,$SIZE_T($sp) + + slgr $len,$fp + brc 1,.Lxts_km_go # not zero, no borrow + algr $fp,$len # input is shorter than allocated buffer + lghi $len,0 + st${g} $fp,$SIZE_T($sp) + +.Lxts_km_go: + lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian + lrvg $s1,$tweak+8($s2) + + la $s2,16($sp) # vector of ascending tweak values + slgr $s2,$inp + srlg $s3,$fp,4 + j .Lxts_km_start + +.Lxts_km_loop: + la $s2,16($sp) + slgr $s2,$inp + srlg $s3,$fp,4 +.Lxts_km_prepare: + lghi $i1,0x87 + srag $i2,$s1,63 # broadcast upper bit + ngr $i1,$i2 # rem + srlg $i2,$s0,63 # carry bit from lower half + sllg $s0,$s0,1 + sllg $s1,$s1,1 + xgr $s0,$i1 + ogr $s1,$i2 +.Lxts_km_start: + lrvgr $i1,$s0 # flip byte order + lrvgr $i2,$s1 + stg $i1,0($s2,$inp) + stg $i2,8($s2,$inp) + xg $i1,0($inp) + xg $i2,8($inp) + stg $i1,0($out,$inp) + stg $i2,8($out,$inp) + la $inp,16($inp) + brct $s3,.Lxts_km_prepare + + slgr $inp,$fp # rewind $inp + la $s2,0($out,$inp) + lgr $s3,$fp + .long 0xb92e00aa # km $s2,$s2 + brc 1,.-4 # pay attention to "partial completion" + + la $s2,16($sp) + slgr $s2,$inp + srlg $s3,$fp,4 +.Lxts_km_xor: + lg $i1,0($out,$inp) + lg $i2,8($out,$inp) + xg $i1,0($s2,$inp) + xg $i2,8($s2,$inp) + stg $i1,0($out,$inp) + stg $i2,8($out,$inp) + la $inp,16($inp) + brct $s3,.Lxts_km_xor + + slgr $len,$fp + brc 1,.Lxts_km_loop # not zero, no borrow + algr $fp,$len + lghi $len,0 + brc 4+1,.Lxts_km_loop # not zero + + l${g} $i1,0($sp) # back-chain + llgf $fp,`2*$SIZE_T-4`($sp) # bytes used + la $i2,16($sp) + srlg $fp,$fp,4 +.Lxts_km_zap: + stg $i1,0($i2) + stg $i1,8($i2) + la $i2,16($i2) + brct $fp,.Lxts_km_zap + + la $sp,0($i1) + llgc $len,2*$SIZE_T-1($i1) + nill $len,0x0f # $len%=16 + bzr $ra + + # generate one more tweak... + lghi $i1,0x87 + srag $i2,$s1,63 # broadcast upper bit + ngr $i1,$i2 # rem + srlg $i2,$s0,63 # carry bit from lower half + sllg $s0,$s0,1 + sllg $s1,$s1,1 + xgr $s0,$i1 + ogr $s1,$i2 + + ltr $len,$len # clear zero flag + br $ra +.size _s390x_xts_km,.-_s390x_xts_km + +.globl AES_xts_encrypt +.type AES_xts_encrypt,\@function +.align 16 +AES_xts_encrypt: + xgr %r3,%r4 # flip %r3 and %r4, $out and $len + xgr %r4,%r3 + xgr %r3,%r4 +___ +$code.=<<___ if ($SIZE_T==4); + llgfr $len,$len +___ +$code.=<<___; + st${g} $len,1*$SIZE_T($sp) # save copy of $len + srag $len,$len,4 # formally wrong, because it expands + # sign byte, but who can afford asking + # to process more than 2^63-1 bytes? + # I use it, because it sets condition + # code... + bcr 8,$ra # abort if zero (i.e. less than 16) +___ +$code.=<<___ if (!$softonly); + llgf %r0,240($key2) + lhi %r1,16 + clr %r0,%r1 + jl .Lxts_enc_software + + stm${g} %r6,$s3,6*$SIZE_T($sp) + st${g} $ra,14*$SIZE_T($sp) + + sllg $len,$len,4 # $len&=~15 + slgr $out,$inp + + # generate the tweak value + l${g} $s3,$stdframe($sp) # pointer to iv + la $s2,$tweak($sp) + lmg $s0,$s1,0($s3) + lghi $s3,16 + stmg $s0,$s1,0($s2) + la %r1,0($key2) # $key2 is not needed anymore + .long 0xb92e00aa # km $s2,$s2, generate the tweak + brc 1,.-4 # can this happen? + + l %r0,240($key1) + la %r1,0($key1) # $key1 is not needed anymore + bras $ra,_s390x_xts_km + jz .Lxts_enc_km_done + + aghi $inp,-16 # take one step back + la $i3,0($out,$inp) # put aside real $out +.Lxts_enc_km_steal: + llgc $i1,16($inp) + llgc $i2,0($out,$inp) + stc $i1,0($out,$inp) + stc $i2,16($out,$inp) + la $inp,1($inp) + brct $len,.Lxts_enc_km_steal + + la $s2,0($i3) + lghi $s3,16 + lrvgr $i1,$s0 # flip byte order + lrvgr $i2,$s1 + xg $i1,0($s2) + xg $i2,8($s2) + stg $i1,0($s2) + stg $i2,8($s2) + .long 0xb92e00aa # km $s2,$s2 + brc 1,.-4 # can this happen? + lrvgr $i1,$s0 # flip byte order + lrvgr $i2,$s1 + xg $i1,0($i3) + xg $i2,8($i3) + stg $i1,0($i3) + stg $i2,8($i3) + +.Lxts_enc_km_done: + l${g} $ra,14*$SIZE_T($sp) + st${g} $sp,$tweak($sp) # wipe tweak + st${g} $sp,$tweak($sp) + lm${g} %r6,$s3,6*$SIZE_T($sp) + br $ra +.align 16 +.Lxts_enc_software: +___ +$code.=<<___; + stm${g} %r6,$ra,6*$SIZE_T($sp) + + slgr $out,$inp + + xgr $s0,$s0 # clear upper half + xgr $s1,$s1 + lrv $s0,$stdframe+4($sp) # load secno + lrv $s1,$stdframe+0($sp) + xgr $s2,$s2 + xgr $s3,$s3 + stm${g} %r2,%r5,2*$SIZE_T($sp) + la $key,0($key2) + larl $tbl,AES_Te + bras $ra,_s390x_AES_encrypt # generate the tweak + lm${g} %r2,%r5,2*$SIZE_T($sp) + stm $s0,$s3,$tweak($sp) # save the tweak + j .Lxts_enc_enter + +.align 16 +.Lxts_enc_loop: + lrvg $s1,$tweak+0($sp) # load the tweak in little-endian + lrvg $s3,$tweak+8($sp) + lghi %r1,0x87 + srag %r0,$s3,63 # broadcast upper bit + ngr %r1,%r0 # rem + srlg %r0,$s1,63 # carry bit from lower half + sllg $s1,$s1,1 + sllg $s3,$s3,1 + xgr $s1,%r1 + ogr $s3,%r0 + lrvgr $s1,$s1 # flip byte order + lrvgr $s3,$s3 + srlg $s0,$s1,32 # smash the tweak to 4x32-bits + stg $s1,$tweak+0($sp) # save the tweak + llgfr $s1,$s1 + srlg $s2,$s3,32 + stg $s3,$tweak+8($sp) + llgfr $s3,$s3 + la $inp,16($inp) # $inp+=16 +.Lxts_enc_enter: + x $s0,0($inp) # ^=*($inp) + x $s1,4($inp) + x $s2,8($inp) + x $s3,12($inp) + stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing + la $key,0($key1) + bras $ra,_s390x_AES_encrypt + lm${g} %r2,%r5,2*$SIZE_T($sp) + x $s0,$tweak+0($sp) # ^=tweak + x $s1,$tweak+4($sp) + x $s2,$tweak+8($sp) + x $s3,$tweak+12($sp) + st $s0,0($out,$inp) + st $s1,4($out,$inp) + st $s2,8($out,$inp) + st $s3,12($out,$inp) + brct${g} $len,.Lxts_enc_loop + + llgc $len,`2*$SIZE_T-1`($sp) + nill $len,0x0f # $len%16 + jz .Lxts_enc_done + + la $i3,0($inp,$out) # put aside real $out +.Lxts_enc_steal: + llgc %r0,16($inp) + llgc %r1,0($out,$inp) + stc %r0,0($out,$inp) + stc %r1,16($out,$inp) + la $inp,1($inp) + brct $len,.Lxts_enc_steal + la $out,0($i3) # restore real $out + + # generate last tweak... + lrvg $s1,$tweak+0($sp) # load the tweak in little-endian + lrvg $s3,$tweak+8($sp) + lghi %r1,0x87 + srag %r0,$s3,63 # broadcast upper bit + ngr %r1,%r0 # rem + srlg %r0,$s1,63 # carry bit from lower half + sllg $s1,$s1,1 + sllg $s3,$s3,1 + xgr $s1,%r1 + ogr $s3,%r0 + lrvgr $s1,$s1 # flip byte order + lrvgr $s3,$s3 + srlg $s0,$s1,32 # smash the tweak to 4x32-bits + stg $s1,$tweak+0($sp) # save the tweak + llgfr $s1,$s1 + srlg $s2,$s3,32 + stg $s3,$tweak+8($sp) + llgfr $s3,$s3 + + x $s0,0($out) # ^=*(inp)|stolen cipther-text + x $s1,4($out) + x $s2,8($out) + x $s3,12($out) + st${g} $out,4*$SIZE_T($sp) + la $key,0($key1) + bras $ra,_s390x_AES_encrypt + l${g} $out,4*$SIZE_T($sp) + x $s0,`$tweak+0`($sp) # ^=tweak + x $s1,`$tweak+4`($sp) + x $s2,`$tweak+8`($sp) + x $s3,`$tweak+12`($sp) + st $s0,0($out) + st $s1,4($out) + st $s2,8($out) + st $s3,12($out) + +.Lxts_enc_done: + stg $sp,$tweak+0($sp) # wipe tweak + stg $sp,$twesk+8($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) + br $ra +.size AES_xts_encrypt,.-AES_xts_encrypt +___ +# void AES_xts_decrypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2,u64 secno); +# +$code.=<<___; +.globl AES_xts_decrypt +.type AES_xts_decrypt,\@function +.align 16 +AES_xts_decrypt: + xgr %r3,%r4 # flip %r3 and %r4, $out and $len + xgr %r4,%r3 + xgr %r3,%r4 +___ +$code.=<<___ if ($SIZE_T==4); + llgfr $len,$len +___ +$code.=<<___; + st${g} $len,1*$SIZE_T($sp) # save copy of $len + aghi $len,-16 + bcr 4,$ra # abort if less than zero. formally + # wrong, because $len is unsigned, + # but who can afford asking to + # process more than 2^63-1 bytes? + tmll $len,0x0f + jnz .Lxts_dec_proceed + aghi $len,16 +.Lxts_dec_proceed: +___ +$code.=<<___ if (!$softonly); + llgf %r0,240($key2) + lhi %r1,16 + clr %r0,%r1 + jl .Lxts_dec_software + + stm${g} %r6,$s3,6*$SIZE_T($sp) + st${g} $ra,14*$SIZE_T($sp) + + nill $len,0xfff0 # $len&=~15 + slgr $out,$inp + + # generate the tweak value + l${g} $s3,$stdframe($sp) # pointer to iv + la $s2,$tweak($sp) + lmg $s0,$s1,0($s3) + lghi $s3,16 + stmg $s0,$s1,0($s2) + la %r1,0($key2) # $key2 is not needed past this point + .long 0xb92e00aa # km $s2,$s2, generate the tweak + brc 1,.-4 # can this happen? + + l %r0,240($key1) + la %r1,0($key1) # $key1 is not needed anymore + + ltgr $len,$len + jz .Lxts_dec_km_short + bras $ra,_s390x_xts_km + jz .Lxts_dec_km_done + + lrvgr $s2,$s0 # make copy in reverse byte order + lrvgr $s3,$s1 + j .Lxts_dec_km_2ndtweak + +.Lxts_dec_km_short: + llgc $len,`2*$SIZE_T-1`($sp) + nill $len,0x0f # $len%=16 + lrvg $s0,$tweak+0($sp) # load the tweak + lrvg $s1,$tweak+8($sp) + lrvgr $s2,$s0 # make copy in reverse byte order + lrvgr $s3,$s1 + +.Lxts_dec_km_2ndtweak: + lghi $i1,0x87 + srag $i2,$s1,63 # broadcast upper bit + ngr $i1,$i2 # rem + srlg $i2,$s0,63 # carry bit from lower half + sllg $s0,$s0,1 + sllg $s1,$s1,1 + xgr $s0,$i1 + ogr $s1,$i2 + lrvgr $i1,$s0 # flip byte order + lrvgr $i2,$s1 + + xg $i1,0($inp) + xg $i2,8($inp) + stg $i1,0($out,$inp) + stg $i2,8($out,$inp) + la $i2,0($out,$inp) + lghi $i3,16 + .long 0xb92e0066 # km $i2,$i2 + brc 1,.-4 # can this happen? + lrvgr $i1,$s0 + lrvgr $i2,$s1 + xg $i1,0($out,$inp) + xg $i2,8($out,$inp) + stg $i1,0($out,$inp) + stg $i2,8($out,$inp) + + la $i3,0($out,$inp) # put aside real $out +.Lxts_dec_km_steal: + llgc $i1,16($inp) + llgc $i2,0($out,$inp) + stc $i1,0($out,$inp) + stc $i2,16($out,$inp) + la $inp,1($inp) + brct $len,.Lxts_dec_km_steal + + lgr $s0,$s2 + lgr $s1,$s3 + xg $s0,0($i3) + xg $s1,8($i3) + stg $s0,0($i3) + stg $s1,8($i3) + la $s0,0($i3) + lghi $s1,16 + .long 0xb92e0088 # km $s0,$s0 + brc 1,.-4 # can this happen? + xg $s2,0($i3) + xg $s3,8($i3) + stg $s2,0($i3) + stg $s3,8($i3) +.Lxts_dec_km_done: + l${g} $ra,14*$SIZE_T($sp) + st${g} $sp,$tweak($sp) # wipe tweak + st${g} $sp,$tweak($sp) + lm${g} %r6,$s3,6*$SIZE_T($sp) + br $ra +.align 16 +.Lxts_dec_software: +___ +$code.=<<___; + stm${g} %r6,$ra,6*$SIZE_T($sp) + + srlg $len,$len,4 + slgr $out,$inp + + xgr $s0,$s0 # clear upper half + xgr $s1,$s1 + lrv $s0,$stdframe+4($sp) # load secno + lrv $s1,$stdframe+0($sp) + xgr $s2,$s2 + xgr $s3,$s3 + stm${g} %r2,%r5,2*$SIZE_T($sp) + la $key,0($key2) + larl $tbl,AES_Te + bras $ra,_s390x_AES_encrypt # generate the tweak + lm${g} %r2,%r5,2*$SIZE_T($sp) + larl $tbl,AES_Td + lt${g}r $len,$len + stm $s0,$s3,$tweak($sp) # save the tweak + jz .Lxts_dec_short + j .Lxts_dec_enter + +.align 16 +.Lxts_dec_loop: + lrvg $s1,$tweak+0($sp) # load the tweak in little-endian + lrvg $s3,$tweak+8($sp) + lghi %r1,0x87 + srag %r0,$s3,63 # broadcast upper bit + ngr %r1,%r0 # rem + srlg %r0,$s1,63 # carry bit from lower half + sllg $s1,$s1,1 + sllg $s3,$s3,1 + xgr $s1,%r1 + ogr $s3,%r0 + lrvgr $s1,$s1 # flip byte order + lrvgr $s3,$s3 + srlg $s0,$s1,32 # smash the tweak to 4x32-bits + stg $s1,$tweak+0($sp) # save the tweak + llgfr $s1,$s1 + srlg $s2,$s3,32 + stg $s3,$tweak+8($sp) + llgfr $s3,$s3 +.Lxts_dec_enter: + x $s0,0($inp) # tweak^=*(inp) + x $s1,4($inp) + x $s2,8($inp) + x $s3,12($inp) + stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing + la $key,0($key1) + bras $ra,_s390x_AES_decrypt + lm${g} %r2,%r5,2*$SIZE_T($sp) + x $s0,$tweak+0($sp) # ^=tweak + x $s1,$tweak+4($sp) + x $s2,$tweak+8($sp) + x $s3,$tweak+12($sp) + st $s0,0($out,$inp) + st $s1,4($out,$inp) + st $s2,8($out,$inp) + st $s3,12($out,$inp) + la $inp,16($inp) + brct${g} $len,.Lxts_dec_loop + + llgc $len,`2*$SIZE_T-1`($sp) + nill $len,0x0f # $len%16 + jz .Lxts_dec_done + + # generate pair of tweaks... + lrvg $s1,$tweak+0($sp) # load the tweak in little-endian + lrvg $s3,$tweak+8($sp) + lghi %r1,0x87 + srag %r0,$s3,63 # broadcast upper bit + ngr %r1,%r0 # rem + srlg %r0,$s1,63 # carry bit from lower half + sllg $s1,$s1,1 + sllg $s3,$s3,1 + xgr $s1,%r1 + ogr $s3,%r0 + lrvgr $i2,$s1 # flip byte order + lrvgr $i3,$s3 + stmg $i2,$i3,$tweak($sp) # save the 1st tweak + j .Lxts_dec_2ndtweak + +.align 16 +.Lxts_dec_short: + llgc $len,`2*$SIZE_T-1`($sp) + nill $len,0x0f # $len%16 + lrvg $s1,$tweak+0($sp) # load the tweak in little-endian + lrvg $s3,$tweak+8($sp) +.Lxts_dec_2ndtweak: + lghi %r1,0x87 + srag %r0,$s3,63 # broadcast upper bit + ngr %r1,%r0 # rem + srlg %r0,$s1,63 # carry bit from lower half + sllg $s1,$s1,1 + sllg $s3,$s3,1 + xgr $s1,%r1 + ogr $s3,%r0 + lrvgr $s1,$s1 # flip byte order + lrvgr $s3,$s3 + srlg $s0,$s1,32 # smash the tweak to 4x32-bits + stg $s1,$tweak-16+0($sp) # save the 2nd tweak + llgfr $s1,$s1 + srlg $s2,$s3,32 + stg $s3,$tweak-16+8($sp) + llgfr $s3,$s3 + + x $s0,0($inp) # tweak_the_2nd^=*(inp) + x $s1,4($inp) + x $s2,8($inp) + x $s3,12($inp) + stm${g} %r2,%r3,2*$SIZE_T($sp) + la $key,0($key1) + bras $ra,_s390x_AES_decrypt + lm${g} %r2,%r5,2*$SIZE_T($sp) + x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd + x $s1,$tweak-16+4($sp) + x $s2,$tweak-16+8($sp) + x $s3,$tweak-16+12($sp) + st $s0,0($out,$inp) + st $s1,4($out,$inp) + st $s2,8($out,$inp) + st $s3,12($out,$inp) + + la $i3,0($out,$inp) # put aside real $out +.Lxts_dec_steal: + llgc %r0,16($inp) + llgc %r1,0($out,$inp) + stc %r0,0($out,$inp) + stc %r1,16($out,$inp) + la $inp,1($inp) + brct $len,.Lxts_dec_steal + la $out,0($i3) # restore real $out + + lm $s0,$s3,$tweak($sp) # load the 1st tweak + x $s0,0($out) # tweak^=*(inp)|stolen cipher-text + x $s1,4($out) + x $s2,8($out) + x $s3,12($out) + st${g} $out,4*$SIZE_T($sp) + la $key,0($key1) + bras $ra,_s390x_AES_decrypt + l${g} $out,4*$SIZE_T($sp) + x $s0,$tweak+0($sp) # ^=tweak + x $s1,$tweak+4($sp) + x $s2,$tweak+8($sp) + x $s3,$tweak+12($sp) + st $s0,0($out) + st $s1,4($out) + st $s2,8($out) + st $s3,12($out) + stg $sp,$tweak-16+0($sp) # wipe 2nd tweak + stg $sp,$tweak-16+8($sp) +.Lxts_dec_done: + stg $sp,$tweak+0($sp) # wipe tweak + stg $sp,$twesk+8($sp) + lm${g} %r6,$ra,6*$SIZE_T($sp) + br $ra +.size AES_xts_decrypt,.-AES_xts_decrypt ___ } $code.=<<___; .string "AES for s390x, CRYPTOGAMS by " +.comm OPENSSL_s390xcap_P,16,8 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; +close STDOUT; # force flush diff --git a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl index c57b3a2d6d..403c4d1290 100755 --- a/src/lib/libcrypto/aes/asm/aes-sparcv9.pl +++ b/src/lib/libcrypto/aes/asm/aes-sparcv9.pl @@ -1176,6 +1176,7 @@ ___ # As UltraSPARC T1, a.k.a. Niagara, has shared FPU, FP nops can have # undesired effect, so just omit them and sacrifice some portion of # percent in performance... -$code =~ s/fmovs.*$//gem; +$code =~ s/fmovs.*$//gm; print $code; +close STDOUT; # ensure flush diff --git a/src/lib/libcrypto/aes/asm/aes-x86_64.pl b/src/lib/libcrypto/aes/asm/aes-x86_64.pl index a545e892ae..48fa857d5b 100755 --- a/src/lib/libcrypto/aes/asm/aes-x86_64.pl +++ b/src/lib/libcrypto/aes/asm/aes-x86_64.pl @@ -588,6 +588,9 @@ $code.=<<___; .globl AES_encrypt .type AES_encrypt,\@function,3 .align 16 +.globl asm_AES_encrypt +.hidden asm_AES_encrypt +asm_AES_encrypt: AES_encrypt: push %rbx push %rbp @@ -1184,6 +1187,9 @@ $code.=<<___; .globl AES_decrypt .type AES_decrypt,\@function,3 .align 16 +.globl asm_AES_decrypt +.hidden asm_AES_decrypt +asm_AES_decrypt: AES_decrypt: push %rbx push %rbp @@ -1277,13 +1283,13 @@ $code.=<<___; ___ } -# int AES_set_encrypt_key(const unsigned char *userKey, const int bits, +# int private_AES_set_encrypt_key(const unsigned char *userKey, const int bits, # AES_KEY *key) $code.=<<___; -.globl AES_set_encrypt_key -.type AES_set_encrypt_key,\@function,3 +.globl private_AES_set_encrypt_key +.type private_AES_set_encrypt_key,\@function,3 .align 16 -AES_set_encrypt_key: +private_AES_set_encrypt_key: push %rbx push %rbp push %r12 # redundant, but allows to share @@ -1304,7 +1310,7 @@ AES_set_encrypt_key: add \$56,%rsp .Lenc_key_epilogue: ret -.size AES_set_encrypt_key,.-AES_set_encrypt_key +.size private_AES_set_encrypt_key,.-private_AES_set_encrypt_key .type _x86_64_AES_set_encrypt_key,\@abi-omnipotent .align 16 @@ -1547,13 +1553,13 @@ $code.=<<___; ___ } -# int AES_set_decrypt_key(const unsigned char *userKey, const int bits, +# int private_AES_set_decrypt_key(const unsigned char *userKey, const int bits, # AES_KEY *key) $code.=<<___; -.globl AES_set_decrypt_key -.type AES_set_decrypt_key,\@function,3 +.globl private_AES_set_decrypt_key +.type private_AES_set_decrypt_key,\@function,3 .align 16 -AES_set_decrypt_key: +private_AES_set_decrypt_key: push %rbx push %rbp push %r12 @@ -1622,7 +1628,7 @@ $code.=<<___; add \$56,%rsp .Ldec_key_epilogue: ret -.size AES_set_decrypt_key,.-AES_set_decrypt_key +.size private_AES_set_decrypt_key,.-private_AES_set_decrypt_key ___ # void AES_cbc_encrypt (const void char *inp, unsigned char *out, @@ -1648,6 +1654,9 @@ $code.=<<___; .type AES_cbc_encrypt,\@function,6 .align 16 .extern OPENSSL_ia32cap_P +.globl asm_AES_cbc_encrypt +.hidden asm_AES_cbc_encrypt +asm_AES_cbc_encrypt: AES_cbc_encrypt: cmp \$0,%rdx # check length je .Lcbc_epilogue @@ -2766,13 +2775,13 @@ cbc_se_handler: .rva .LSEH_end_AES_decrypt .rva .LSEH_info_AES_decrypt - .rva .LSEH_begin_AES_set_encrypt_key - .rva .LSEH_end_AES_set_encrypt_key - .rva .LSEH_info_AES_set_encrypt_key + .rva .LSEH_begin_private_AES_set_encrypt_key + .rva .LSEH_end_private_AES_set_encrypt_key + .rva .LSEH_info_private_AES_set_encrypt_key - .rva .LSEH_begin_AES_set_decrypt_key - .rva .LSEH_end_AES_set_decrypt_key - .rva .LSEH_info_AES_set_decrypt_key + .rva .LSEH_begin_private_AES_set_decrypt_key + .rva .LSEH_end_private_AES_set_decrypt_key + .rva .LSEH_info_private_AES_set_decrypt_key .rva .LSEH_begin_AES_cbc_encrypt .rva .LSEH_end_AES_cbc_encrypt @@ -2788,11 +2797,11 @@ cbc_se_handler: .byte 9,0,0,0 .rva block_se_handler .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[] -.LSEH_info_AES_set_encrypt_key: +.LSEH_info_private_AES_set_encrypt_key: .byte 9,0,0,0 .rva key_se_handler .rva .Lenc_key_prologue,.Lenc_key_epilogue # HandlerData[] -.LSEH_info_AES_set_decrypt_key: +.LSEH_info_private_AES_set_decrypt_key: .byte 9,0,0,0 .rva key_se_handler .rva .Ldec_key_prologue,.Ldec_key_epilogue # HandlerData[] diff --git a/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl new file mode 100644 index 0000000000..c6f6b3334a --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aesni-sha1-x86_64.pl @@ -0,0 +1,1249 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# June 2011 +# +# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled +# in http://download.intel.com/design/intarch/papers/323686.pdf, is +# that since AESNI-CBC encrypt exhibit *very* low instruction-level +# parallelism, interleaving it with another algorithm would allow to +# utilize processor resources better and achieve better performance. +# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and +# AESNI code is weaved into it. Below are performance numbers in +# cycles per processed byte, less is better, for standalone AESNI-CBC +# encrypt, sum of the latter and standalone SHA1, and "stitched" +# subroutine: +# +# AES-128-CBC +SHA1 stitch gain +# Westmere 3.77[+5.6] 9.37 6.65 +41% +# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%) +# +# AES-192-CBC +# Westmere 4.51 10.11 6.97 +45% +# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%) +# +# AES-256-CBC +# Westmere 5.25 10.85 7.25 +50% +# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%) +# +# (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for +# background information. Above numbers in parentheses are SSSE3 +# results collected on AVX-capable CPU, i.e. apply on OSes that +# don't support AVX. +# +# Needless to mention that it makes no sense to implement "stitched" +# *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1 +# fully utilize parallelism, so stitching would not give any gain +# anyway. Well, there might be some, e.g. because of better cache +# locality... For reference, here are performance results for +# standalone AESNI-CBC decrypt: +# +# AES-128-CBC AES-192-CBC AES-256-CBC +# Westmere 1.31 1.55 1.80 +# Sandy Bridge 0.93 1.06 1.22 + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/ && + $1>=2.19); +$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && + $1>=2.09); +$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./ && + $1>=10); + +open STDOUT,"| $^X $xlate $flavour $output"; + +# void aesni_cbc_sha1_enc(const void *inp, +# void *out, +# size_t length, +# const AES_KEY *key, +# unsigned char *iv, +# SHA_CTX *ctx, +# const void *in0); + +$code.=<<___; +.text +.extern OPENSSL_ia32cap_P + +.globl aesni_cbc_sha1_enc +.type aesni_cbc_sha1_enc,\@abi-omnipotent +.align 16 +aesni_cbc_sha1_enc: + # caller should check for SSSE3 and AES-NI bits + mov OPENSSL_ia32cap_P+0(%rip),%r10d + mov OPENSSL_ia32cap_P+4(%rip),%r11d +___ +$code.=<<___ if ($avx); + and \$`1<<28`,%r11d # mask AVX bit + and \$`1<<30`,%r10d # mask "Intel CPU" bit + or %r11d,%r10d + cmp \$`1<<28|1<<30`,%r10d + je aesni_cbc_sha1_enc_avx +___ +$code.=<<___; + jmp aesni_cbc_sha1_enc_ssse3 + ret +.size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc +___ + +my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); + +my $Xi=4; +my @X=map("%xmm$_",(4..7,0..3)); +my @Tx=map("%xmm$_",(8..10)); +my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization +my @T=("%esi","%edi"); +my $j=0; my $jj=0; my $r=0; my $sn=0; +my $K_XX_XX="%r11"; +my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13)); +my @rndkey=("%xmm14","%xmm15"); + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +my $_rol=sub { &rol(@_) }; +my $_ror=sub { &ror(@_) }; + +$code.=<<___; +.type aesni_cbc_sha1_enc_ssse3,\@function,6 +.align 16 +aesni_cbc_sha1_enc_ssse3: + mov `($win64?56:8)`(%rsp),$inp # load 7th argument + #shr \$6,$len # debugging artefact + #jz .Lepilogue_ssse3 # debugging artefact + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + lea `-104-($win64?10*16:0)`(%rsp),%rsp + #mov $in0,$inp # debugging artefact + #lea 64(%rsp),$ctx # debugging artefact +___ +$code.=<<___ if ($win64); + movaps %xmm6,96+0(%rsp) + movaps %xmm7,96+16(%rsp) + movaps %xmm8,96+32(%rsp) + movaps %xmm9,96+48(%rsp) + movaps %xmm10,96+64(%rsp) + movaps %xmm11,96+80(%rsp) + movaps %xmm12,96+96(%rsp) + movaps %xmm13,96+112(%rsp) + movaps %xmm14,96+128(%rsp) + movaps %xmm15,96+144(%rsp) +.Lprologue_ssse3: +___ +$code.=<<___; + mov $in0,%r12 # reassign arguments + mov $out,%r13 + mov $len,%r14 + mov $key,%r15 + movdqu ($ivp),$iv # load IV + mov $ivp,88(%rsp) # save $ivp +___ +my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments +my $rounds="${ivp}d"; +$code.=<<___; + shl \$6,$len + sub $in0,$out + mov 240($key),$rounds + add $inp,$len # end of input + + lea K_XX_XX(%rip),$K_XX_XX + mov 0($ctx),$A # load context + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov $B,@T[0] # magic seed + mov 16($ctx),$E + + movdqa 64($K_XX_XX),@X[2] # pbswap mask + movdqa 0($K_XX_XX),@Tx[1] # K_00_19 + movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] + movdqu 16($inp),@X[-3&7] + movdqu 32($inp),@X[-2&7] + movdqu 48($inp),@X[-1&7] + pshufb @X[2],@X[-4&7] # byte swap + add \$64,$inp + pshufb @X[2],@X[-3&7] + pshufb @X[2],@X[-2&7] + pshufb @X[2],@X[-1&7] + paddd @Tx[1],@X[-4&7] # add K_00_19 + paddd @Tx[1],@X[-3&7] + paddd @Tx[1],@X[-2&7] + movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU + psubd @Tx[1],@X[-4&7] # restore X[] + movdqa @X[-3&7],16(%rsp) + psubd @Tx[1],@X[-3&7] + movdqa @X[-2&7],32(%rsp) + psubd @Tx[1],@X[-2&7] + movups ($key),$rndkey0 # $key[0] + movups 16($key),$rndkey[0] # forward reference + jmp .Loop_ssse3 +___ + +my $aesenc=sub { + use integer; + my ($n,$k)=($r/10,$r%10); + if ($k==0) { + $code.=<<___; + movups `16*$n`($in0),$in # load input + xorps $rndkey0,$in +___ + $code.=<<___ if ($n); + movups $iv,`16*($n-1)`($out,$in0) # write output +___ + $code.=<<___; + xorps $in,$iv + aesenc $rndkey[0],$iv + movups `32+16*$k`($key),$rndkey[1] +___ + } elsif ($k==9) { + $sn++; + $code.=<<___; + cmp \$11,$rounds + jb .Laesenclast$sn + movups `32+16*($k+0)`($key),$rndkey[1] + aesenc $rndkey[0],$iv + movups `32+16*($k+1)`($key),$rndkey[0] + aesenc $rndkey[1],$iv + je .Laesenclast$sn + movups `32+16*($k+2)`($key),$rndkey[1] + aesenc $rndkey[0],$iv + movups `32+16*($k+3)`($key),$rndkey[0] + aesenc $rndkey[1],$iv +.Laesenclast$sn: + aesenclast $rndkey[0],$iv + movups 16($key),$rndkey[1] # forward reference +___ + } else { + $code.=<<___; + aesenc $rndkey[0],$iv + movups `32+16*$k`($key),$rndkey[1] +___ + } + $r++; unshift(@rndkey,pop(@rndkey)); +}; + +sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 40 instructions + my ($a,$b,$c,$d,$e); + + &movdqa (@X[0],@X[-3&7]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (@Tx[0],@X[-1&7]); + &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + &psrldq (@Tx[0],4); # "X[-3]", 3 dwords + eval(shift(@insns)); + eval(shift(@insns)); + &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (@Tx[2],@X[0]); + &movdqa (@Tx[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword + &paddd (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &psrld (@Tx[0],31); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (@Tx[1],@Tx[2]); + eval(shift(@insns)); + eval(shift(@insns)); + + &psrld (@Tx[2],30); + &por (@X[0],@Tx[0]); # "X[0]"<<<=1 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pslld (@Tx[1],2); + &pxor (@X[0],@Tx[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xupdate_ssse3_32_79() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my ($a,$b,$c,$d,$e); + + &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); + eval(shift(@insns)); # body_20_39 + &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + eval(shift(@insns)); + eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); + if ($Xi%5) { + &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... + } else { # ... or load next one + &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); + } + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &movdqa (@Tx[0],@X[0]); + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &pslld (@X[0],2); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &psrld (@Tx[0],30); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &por (@X[0],@Tx[0]); # "X[0]"<<<=2 + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &movdqa (@Tx[1],@X[0]) if ($Xi<19); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xuplast_ssse3_80() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU + + foreach (@insns) { eval; } # remaining instructions + + &cmp ($inp,$len); + &je (".Ldone_ssse3"); + + unshift(@Tx,pop(@Tx)); + + &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask + &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 + &movdqu (@X[-4&7],"0($inp)"); # load input + &movdqu (@X[-3&7],"16($inp)"); + &movdqu (@X[-2&7],"32($inp)"); + &movdqu (@X[-1&7],"48($inp)"); + &pshufb (@X[-4&7],@X[2]); # byte swap + &add ($inp,64); + + $Xi=0; +} + +sub Xloop_ssse3() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &pshufb (@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[($Xi-4)&7],@Tx[1]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + &psubd (@X[($Xi-4)&7],@Tx[1]); + + foreach (@insns) { eval; } + $Xi++; +} + +sub Xtail_ssse3() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + foreach (@insns) { eval; } +} + +sub body_00_19 () { + use integer; + my ($k,$n); + my @r=( + '($a,$b,$c,$d,$e)=@V;'. + '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer + '&xor ($c,$d);', + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&and (@T[0],$c);', # ($b&($c^$d)) + '&xor ($c,$d);', # restore $c + '&xor (@T[0],$d);', + '&add ($e,$a);', + '&$_ror ($b,$j?7:2);', # $b>>>2 + '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); + $n = scalar(@r); + $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds + @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); + $jj++; + return @r; +} + +sub body_20_39 () { + use integer; + my ($k,$n); + my @r=( + '($a,$b,$c,$d,$e)=@V;'. + '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer + '&xor (@T[0],$d);', # ($b^$d) + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&xor (@T[0],$c);', # ($b^$d^$c) + '&add ($e,$a);', + '&$_ror ($b,7);', # $b>>>2 + '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); + $n = scalar(@r); + $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds + @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); + $jj++; + return @r; +} + +sub body_40_59 () { + use integer; + my ($k,$n); + my @r=( + '($a,$b,$c,$d,$e)=@V;'. + '&mov (@T[1],$c);', + '&xor ($c,$d);', + '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer + '&and (@T[1],$d);', + '&and (@T[0],$c);', # ($b&($c^$d)) + '&$_ror ($b,7);', # $b>>>2 + '&add ($e,@T[1]);', + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&add ($e,@T[0]);', + '&xor ($c,$d);', # restore $c + '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); + $n = scalar(@r); + $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds + @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); + $jj++; + return @r; +} +$code.=<<___; +.align 16 +.Loop_ssse3: +___ + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_32_79(\&body_00_19); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + $saved_r=$r; @saved_rndkey=@rndkey; + + &Xloop_ssse3(\&body_20_39); + &Xloop_ssse3(\&body_20_39); + &Xloop_ssse3(\&body_20_39); + +$code.=<<___; + movups $iv,48($out,$in0) # write output + lea 64($in0),$in0 + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + add 12($ctx),$D + mov $A,0($ctx) + add 16($ctx),$E + mov @T[0],4($ctx) + mov @T[0],$B # magic seed + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + jmp .Loop_ssse3 + +.align 16 +.Ldone_ssse3: +___ + $jj=$j=$saved_j; @V=@saved_V; + $r=$saved_r; @rndkey=@saved_rndkey; + + &Xtail_ssse3(\&body_20_39); + &Xtail_ssse3(\&body_20_39); + &Xtail_ssse3(\&body_20_39); + +$code.=<<___; + movups $iv,48($out,$in0) # write output + mov 88(%rsp),$ivp # restore $ivp + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + mov $A,0($ctx) + add 12($ctx),$D + mov @T[0],4($ctx) + add 16($ctx),$E + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + movups $iv,($ivp) # write IV +___ +$code.=<<___ if ($win64); + movaps 96+0(%rsp),%xmm6 + movaps 96+16(%rsp),%xmm7 + movaps 96+32(%rsp),%xmm8 + movaps 96+48(%rsp),%xmm9 + movaps 96+64(%rsp),%xmm10 + movaps 96+80(%rsp),%xmm11 + movaps 96+96(%rsp),%xmm12 + movaps 96+112(%rsp),%xmm13 + movaps 96+128(%rsp),%xmm14 + movaps 96+144(%rsp),%xmm15 +___ +$code.=<<___; + lea `104+($win64?10*16:0)`(%rsp),%rsi + mov 0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_ssse3: + ret +.size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 +___ + +$j=$jj=$r=$sn=0; + +if ($avx) { +my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); + +my $Xi=4; +my @X=map("%xmm$_",(4..7,0..3)); +my @Tx=map("%xmm$_",(8..10)); +my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization +my @T=("%esi","%edi"); + +my $_rol=sub { &shld(@_[0],@_) }; +my $_ror=sub { &shrd(@_[0],@_) }; + +$code.=<<___; +.type aesni_cbc_sha1_enc_avx,\@function,6 +.align 16 +aesni_cbc_sha1_enc_avx: + mov `($win64?56:8)`(%rsp),$inp # load 7th argument + #shr \$6,$len # debugging artefact + #jz .Lepilogue_avx # debugging artefact + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + lea `-104-($win64?10*16:0)`(%rsp),%rsp + #mov $in0,$inp # debugging artefact + #lea 64(%rsp),$ctx # debugging artefact +___ +$code.=<<___ if ($win64); + movaps %xmm6,96+0(%rsp) + movaps %xmm7,96+16(%rsp) + movaps %xmm8,96+32(%rsp) + movaps %xmm9,96+48(%rsp) + movaps %xmm10,96+64(%rsp) + movaps %xmm11,96+80(%rsp) + movaps %xmm12,96+96(%rsp) + movaps %xmm13,96+112(%rsp) + movaps %xmm14,96+128(%rsp) + movaps %xmm15,96+144(%rsp) +.Lprologue_avx: +___ +$code.=<<___; + vzeroall + mov $in0,%r12 # reassign arguments + mov $out,%r13 + mov $len,%r14 + mov $key,%r15 + vmovdqu ($ivp),$iv # load IV + mov $ivp,88(%rsp) # save $ivp +___ +my ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments +my $rounds="${ivp}d"; +$code.=<<___; + shl \$6,$len + sub $in0,$out + mov 240($key),$rounds + add \$112,$key # size optimization + add $inp,$len # end of input + + lea K_XX_XX(%rip),$K_XX_XX + mov 0($ctx),$A # load context + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov $B,@T[0] # magic seed + mov 16($ctx),$E + + vmovdqa 64($K_XX_XX),@X[2] # pbswap mask + vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 + vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] + vmovdqu 16($inp),@X[-3&7] + vmovdqu 32($inp),@X[-2&7] + vmovdqu 48($inp),@X[-1&7] + vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap + add \$64,$inp + vpshufb @X[2],@X[-3&7],@X[-3&7] + vpshufb @X[2],@X[-2&7],@X[-2&7] + vpshufb @X[2],@X[-1&7],@X[-1&7] + vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19 + vpaddd @Tx[1],@X[-3&7],@X[1] + vpaddd @Tx[1],@X[-2&7],@X[2] + vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU + vmovdqa @X[1],16(%rsp) + vmovdqa @X[2],32(%rsp) + vmovups -112($key),$rndkey0 # $key[0] + vmovups 16-112($key),$rndkey[0] # forward reference + jmp .Loop_avx +___ + +my $aesenc=sub { + use integer; + my ($n,$k)=($r/10,$r%10); + if ($k==0) { + $code.=<<___; + vmovups `16*$n`($in0),$in # load input + vxorps $rndkey0,$in,$in +___ + $code.=<<___ if ($n); + vmovups $iv,`16*($n-1)`($out,$in0) # write output +___ + $code.=<<___; + vxorps $in,$iv,$iv + vaesenc $rndkey[0],$iv,$iv + vmovups `32+16*$k-112`($key),$rndkey[1] +___ + } elsif ($k==9) { + $sn++; + $code.=<<___; + cmp \$11,$rounds + jb .Lvaesenclast$sn + vaesenc $rndkey[0],$iv,$iv + vmovups `32+16*($k+0)-112`($key),$rndkey[1] + vaesenc $rndkey[1],$iv,$iv + vmovups `32+16*($k+1)-112`($key),$rndkey[0] + je .Lvaesenclast$sn + vaesenc $rndkey[0],$iv,$iv + vmovups `32+16*($k+2)-112`($key),$rndkey[1] + vaesenc $rndkey[1],$iv,$iv + vmovups `32+16*($k+3)-112`($key),$rndkey[0] +.Lvaesenclast$sn: + vaesenclast $rndkey[0],$iv,$iv + vmovups 16-112($key),$rndkey[1] # forward reference +___ + } else { + $code.=<<___; + vaesenc $rndkey[0],$iv,$iv + vmovups `32+16*$k-112`($key),$rndkey[1] +___ + } + $r++; unshift(@rndkey,pop(@rndkey)); +}; + +sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 40 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@Tx[0],@X[0],31); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword + &vpaddd (@X[0],@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@Tx[1],@Tx[2],30); + &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslld (@Tx[2],@Tx[2],2); + &vpxor (@X[0],@X[0],@Tx[1]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX + eval(shift(@insns)); + eval(shift(@insns)); + + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xupdate_avx_32_79() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my ($a,$b,$c,$d,$e); + + &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + eval(shift(@insns)); + eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); + if ($Xi%5) { + &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... + } else { # ... or load next one + &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); + } + &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &vpsrld (@Tx[0],@X[0],30); + &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpslld (@X[0],@X[0],2); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xuplast_avx_80() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU + + foreach (@insns) { eval; } # remaining instructions + + &cmp ($inp,$len); + &je (".Ldone_avx"); + + unshift(@Tx,pop(@Tx)); + + &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask + &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19 + &vmovdqu(@X[-4&7],"0($inp)"); # load input + &vmovdqu(@X[-3&7],"16($inp)"); + &vmovdqu(@X[-2&7],"32($inp)"); + &vmovdqu(@X[-1&7],"48($inp)"); + &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap + &add ($inp,64); + + $Xi=0; +} + +sub Xloop_avx() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + foreach (@insns) { eval; } + $Xi++; +} + +sub Xtail_avx() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + foreach (@insns) { eval; } +} + +$code.=<<___; +.align 16 +.Loop_avx: +___ + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_32_79(\&body_00_19); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_20_39); + &Xuplast_avx_80(\&body_20_39); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + $saved_r=$r; @saved_rndkey=@rndkey; + + &Xloop_avx(\&body_20_39); + &Xloop_avx(\&body_20_39); + &Xloop_avx(\&body_20_39); + +$code.=<<___; + vmovups $iv,48($out,$in0) # write output + lea 64($in0),$in0 + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + add 12($ctx),$D + mov $A,0($ctx) + add 16($ctx),$E + mov @T[0],4($ctx) + mov @T[0],$B # magic seed + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + jmp .Loop_avx + +.align 16 +.Ldone_avx: +___ + $jj=$j=$saved_j; @V=@saved_V; + $r=$saved_r; @rndkey=@saved_rndkey; + + &Xtail_avx(\&body_20_39); + &Xtail_avx(\&body_20_39); + &Xtail_avx(\&body_20_39); + +$code.=<<___; + vmovups $iv,48($out,$in0) # write output + mov 88(%rsp),$ivp # restore $ivp + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + mov $A,0($ctx) + add 12($ctx),$D + mov @T[0],4($ctx) + add 16($ctx),$E + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + vmovups $iv,($ivp) # write IV + vzeroall +___ +$code.=<<___ if ($win64); + movaps 96+0(%rsp),%xmm6 + movaps 96+16(%rsp),%xmm7 + movaps 96+32(%rsp),%xmm8 + movaps 96+48(%rsp),%xmm9 + movaps 96+64(%rsp),%xmm10 + movaps 96+80(%rsp),%xmm11 + movaps 96+96(%rsp),%xmm12 + movaps 96+112(%rsp),%xmm13 + movaps 96+128(%rsp),%xmm14 + movaps 96+144(%rsp),%xmm15 +___ +$code.=<<___; + lea `104+($win64?10*16:0)`(%rsp),%rsi + mov 0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_avx: + ret +.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx +___ +} +$code.=<<___; +.align 64 +K_XX_XX: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask + +.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by " +.align 64 +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type ssse3_handler,\@abi-omnipotent +.align 16 +ssse3_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea 96(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + lea `104+10*16`(%rax),%rax # adjust stack pointer + + mov 0(%rax),%r15 + mov 8(%rax),%r14 + mov 16(%rax),%r13 + mov 24(%rax),%r12 + mov 32(%rax),%rbp + mov 40(%rax),%rbx + lea 48(%rax),%rax + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size ssse3_handler,.-ssse3_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3 + .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3 + .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3 +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_aesni_cbc_sha1_enc_avx + .rva .LSEH_end_aesni_cbc_sha1_enc_avx + .rva .LSEH_info_aesni_cbc_sha1_enc_avx +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_aesni_cbc_sha1_enc_ssse3: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] +___ +$code.=<<___ if ($avx); +.LSEH_info_aesni_cbc_sha1_enc_avx: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] +___ +} + +#################################################################### +sub rex { + local *opcode=shift; + my ($dst,$src)=@_; + my $rex=0; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @opcode,$rex|0x40 if($rex); +} + +sub aesni { + my $line=shift; + my @opcode=(0x66); + + if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my %opcodelet = ( + "aesenc" => 0xdc, "aesenclast" => 0xdd + ); + return undef if (!defined($opcodelet{$1})); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x38,$opcodelet{$1}; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + return ".byte\t".join(',',@opcode); + } + return $line; +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; + +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/aes/asm/aesni-x86.pl b/src/lib/libcrypto/aes/asm/aesni-x86.pl new file mode 100644 index 0000000000..3dc345b585 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/aesni-x86.pl @@ -0,0 +1,2189 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements support for Intel AES-NI extension. In +# OpenSSL context it's used with Intel engine, but can also be used as +# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for +# details]. +# +# Performance. +# +# To start with see corresponding paragraph in aesni-x86_64.pl... +# Instead of filling table similar to one found there I've chosen to +# summarize *comparison* results for raw ECB, CTR and CBC benchmarks. +# The simplified table below represents 32-bit performance relative +# to 64-bit one in every given point. Ratios vary for different +# encryption modes, therefore interval values. +# +# 16-byte 64-byte 256-byte 1-KB 8-KB +# 53-67% 67-84% 91-94% 95-98% 97-99.5% +# +# Lower ratios for smaller block sizes are perfectly understandable, +# because function call overhead is higher in 32-bit mode. Largest +# 8-KB block performance is virtually same: 32-bit code is less than +# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise. + +# January 2011 +# +# See aesni-x86_64.pl for details. Unlike x86_64 version this module +# interleaves at most 6 aes[enc|dec] instructions, because there are +# not enough registers for 8x interleave [which should be optimal for +# Sandy Bridge]. Actually, performance results for 6x interleave +# factor presented in aesni-x86_64.pl (except for CTR) are for this +# module. + +# April 2011 +# +# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing +# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09. + +$PREFIX="aesni"; # if $PREFIX is set to "AES", the script + # generates drop-in replacement for + # crypto/aes/asm/aes-586.pl:-) +$inline=1; # inline _aesni_[en|de]crypt + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],$0); + +if ($PREFIX eq "aesni") { $movekey=*movups; } +else { $movekey=*movups; } + +$len="eax"; +$rounds="ecx"; +$key="edx"; +$inp="esi"; +$out="edi"; +$rounds_="ebx"; # backup copy for $rounds +$key_="ebp"; # backup copy for $key + +$rndkey0="xmm0"; +$rndkey1="xmm1"; +$inout0="xmm2"; +$inout1="xmm3"; +$inout2="xmm4"; +$inout3="xmm5"; $in1="xmm5"; +$inout4="xmm6"; $in0="xmm6"; +$inout5="xmm7"; $ivec="xmm7"; + +# AESNI extenstion +sub aeskeygenassist +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); } +} +sub aescommon +{ my($opcodelet,$dst,$src)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);} +} +sub aesimc { aescommon(0xdb,@_); } +sub aesenc { aescommon(0xdc,@_); } +sub aesenclast { aescommon(0xdd,@_); } +sub aesdec { aescommon(0xde,@_); } +sub aesdeclast { aescommon(0xdf,@_); } + +# Inline version of internal aesni_[en|de]crypt1 +{ my $sn; +sub aesni_inline_generate1 +{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); + $sn++; + + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(16,$key)); + &xorps ($ivec,$rndkey0) if (defined($ivec)); + &lea ($key,&DWP(32,$key)); + &xorps ($inout,$ivec) if (defined($ivec)); + &xorps ($inout,$rndkey0) if (!defined($ivec)); + &set_label("${p}1_loop_$sn"); + eval"&aes${p} ($inout,$rndkey1)"; + &dec ($rounds); + &$movekey ($rndkey1,&QWP(0,$key)); + &lea ($key,&DWP(16,$key)); + &jnz (&label("${p}1_loop_$sn")); + eval"&aes${p}last ($inout,$rndkey1)"; +}} + +sub aesni_generate1 # fully unrolled loop +{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout)); + + &function_begin_B("_aesni_${p}rypt1"); + &movups ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(0x10,$key)); + &xorps ($inout,$rndkey0); + &$movekey ($rndkey0,&QWP(0x20,$key)); + &lea ($key,&DWP(0x30,$key)); + &cmp ($rounds,11); + &jb (&label("${p}128")); + &lea ($key,&DWP(0x20,$key)); + &je (&label("${p}192")); + &lea ($key,&DWP(0x20,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(-0x40,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-0x30,$key)); + &set_label("${p}192"); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(-0x20,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(-0x10,$key)); + &set_label("${p}128"); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x10,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x20,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x30,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x40,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x50,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + &$movekey ($rndkey1,&QWP(0x60,$key)); + eval"&aes${p} ($inout,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0x70,$key)); + eval"&aes${p} ($inout,$rndkey1)"; + eval"&aes${p}last ($inout,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt1"); +} + +# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key); +&aesni_generate1("enc") if (!$inline); +&function_begin_B("${PREFIX}_encrypt"); + &mov ("eax",&wparam(0)); + &mov ($key,&wparam(2)); + &movups ($inout0,&QWP(0,"eax")); + &mov ($rounds,&DWP(240,$key)); + &mov ("eax",&wparam(1)); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &movups (&QWP(0,"eax"),$inout0); + &ret (); +&function_end_B("${PREFIX}_encrypt"); + +# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key); +&aesni_generate1("dec") if(!$inline); +&function_begin_B("${PREFIX}_decrypt"); + &mov ("eax",&wparam(0)); + &mov ($key,&wparam(2)); + &movups ($inout0,&QWP(0,"eax")); + &mov ($rounds,&DWP(240,$key)); + &mov ("eax",&wparam(1)); + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + &movups (&QWP(0,"eax"),$inout0); + &ret (); +&function_end_B("${PREFIX}_decrypt"); + +# _aesni_[en|de]cryptN are private interfaces, N denotes interleave +# factor. Why 3x subroutine were originally used in loops? Even though +# aes[enc|dec] latency was originally 6, it could be scheduled only +# every *2nd* cycle. Thus 3x interleave was the one providing optimal +# utilization, i.e. when subroutine's throughput is virtually same as +# of non-interleaved subroutine [for number of input blocks up to 3]. +# This is why it makes no sense to implement 2x subroutine. +# aes[enc|dec] latency in next processor generation is 8, but the +# instructions can be scheduled every cycle. Optimal interleave for +# new processor is therefore 8x, but it's unfeasible to accommodate it +# in XMM registers addreassable in 32-bit mode and therefore 6x is +# used instead... + +sub aesni_generate3 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt3"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shr ($rounds,1); + &$movekey ($rndkey1,&QWP(16,$key)); + &lea ($key,&DWP(32,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &pxor ($inout2,$rndkey0); + &$movekey ($rndkey0,&QWP(0,$key)); + + &set_label("${p}3_loop"); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + &dec ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + &$movekey ($rndkey1,&QWP(16,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + &lea ($key,&DWP(32,$key)); + eval"&aes${p} ($inout2,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0,$key)); + &jnz (&label("${p}3_loop")); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt3"); +} + +# 4x interleave is implemented to improve small block performance, +# most notably [and naturally] 4 block by ~30%. One can argue that one +# should have implemented 5x as well, but improvement would be <20%, +# so it's not worth it... +sub aesni_generate4 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt4"); + &$movekey ($rndkey0,&QWP(0,$key)); + &$movekey ($rndkey1,&QWP(16,$key)); + &shr ($rounds,1); + &lea ($key,&DWP(32,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &pxor ($inout2,$rndkey0); + &pxor ($inout3,$rndkey0); + &$movekey ($rndkey0,&QWP(0,$key)); + + &set_label("${p}4_loop"); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + &dec ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + &$movekey ($rndkey1,&QWP(16,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + &lea ($key,&DWP(32,$key)); + eval"&aes${p} ($inout2,$rndkey0)"; + eval"&aes${p} ($inout3,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0,$key)); + &jnz (&label("${p}4_loop")); + + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + eval"&aes${p}last ($inout3,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt4"); +} + +sub aesni_generate6 +{ my $p=shift; + + &function_begin_B("_aesni_${p}rypt6"); + &static_label("_aesni_${p}rypt6_enter"); + &$movekey ($rndkey0,&QWP(0,$key)); + &shr ($rounds,1); + &$movekey ($rndkey1,&QWP(16,$key)); + &lea ($key,&DWP(32,$key)); + &xorps ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); # pxor does better here + eval"&aes${p} ($inout0,$rndkey1)"; + &pxor ($inout2,$rndkey0); + eval"&aes${p} ($inout1,$rndkey1)"; + &pxor ($inout3,$rndkey0); + &dec ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + &pxor ($inout4,$rndkey0); + eval"&aes${p} ($inout3,$rndkey1)"; + &pxor ($inout5,$rndkey0); + eval"&aes${p} ($inout4,$rndkey1)"; + &$movekey ($rndkey0,&QWP(0,$key)); + eval"&aes${p} ($inout5,$rndkey1)"; + &jmp (&label("_aesni_${p}rypt6_enter")); + + &set_label("${p}6_loop",16); + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + &dec ($rounds); + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p} ($inout4,$rndkey1)"; + eval"&aes${p} ($inout5,$rndkey1)"; + &set_label("_aesni_${p}rypt6_enter",16); + &$movekey ($rndkey1,&QWP(16,$key)); + eval"&aes${p} ($inout0,$rndkey0)"; + eval"&aes${p} ($inout1,$rndkey0)"; + &lea ($key,&DWP(32,$key)); + eval"&aes${p} ($inout2,$rndkey0)"; + eval"&aes${p} ($inout3,$rndkey0)"; + eval"&aes${p} ($inout4,$rndkey0)"; + eval"&aes${p} ($inout5,$rndkey0)"; + &$movekey ($rndkey0,&QWP(0,$key)); + &jnz (&label("${p}6_loop")); + + eval"&aes${p} ($inout0,$rndkey1)"; + eval"&aes${p} ($inout1,$rndkey1)"; + eval"&aes${p} ($inout2,$rndkey1)"; + eval"&aes${p} ($inout3,$rndkey1)"; + eval"&aes${p} ($inout4,$rndkey1)"; + eval"&aes${p} ($inout5,$rndkey1)"; + eval"&aes${p}last ($inout0,$rndkey0)"; + eval"&aes${p}last ($inout1,$rndkey0)"; + eval"&aes${p}last ($inout2,$rndkey0)"; + eval"&aes${p}last ($inout3,$rndkey0)"; + eval"&aes${p}last ($inout4,$rndkey0)"; + eval"&aes${p}last ($inout5,$rndkey0)"; + &ret(); + &function_end_B("_aesni_${p}rypt6"); +} +&aesni_generate3("enc") if ($PREFIX eq "aesni"); +&aesni_generate3("dec"); +&aesni_generate4("enc") if ($PREFIX eq "aesni"); +&aesni_generate4("dec"); +&aesni_generate6("enc") if ($PREFIX eq "aesni"); +&aesni_generate6("dec"); + +if ($PREFIX eq "aesni") { +###################################################################### +# void aesni_ecb_encrypt (const void *in, void *out, +# size_t length, const AES_KEY *key, +# int enc); +&function_begin("aesni_ecb_encrypt"); + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &mov ($rounds_,&wparam(4)); + &and ($len,-16); + &jz (&label("ecb_ret")); + &mov ($rounds,&DWP(240,$key)); + &test ($rounds_,$rounds_); + &jz (&label("ecb_decrypt")); + + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + &cmp ($len,0x60); + &jb (&label("ecb_enc_tail")); + + &movdqu ($inout0,&QWP(0,$inp)); + &movdqu ($inout1,&QWP(0x10,$inp)); + &movdqu ($inout2,&QWP(0x20,$inp)); + &movdqu ($inout3,&QWP(0x30,$inp)); + &movdqu ($inout4,&QWP(0x40,$inp)); + &movdqu ($inout5,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); + &sub ($len,0x60); + &jmp (&label("ecb_enc_loop6_enter")); + +&set_label("ecb_enc_loop6",16); + &movups (&QWP(0,$out),$inout0); + &movdqu ($inout0,&QWP(0,$inp)); + &movups (&QWP(0x10,$out),$inout1); + &movdqu ($inout1,&QWP(0x10,$inp)); + &movups (&QWP(0x20,$out),$inout2); + &movdqu ($inout2,&QWP(0x20,$inp)); + &movups (&QWP(0x30,$out),$inout3); + &movdqu ($inout3,&QWP(0x30,$inp)); + &movups (&QWP(0x40,$out),$inout4); + &movdqu ($inout4,&QWP(0x40,$inp)); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + &movdqu ($inout5,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); +&set_label("ecb_enc_loop6_enter"); + + &call ("_aesni_encrypt6"); + + &mov ($key,$key_); # restore $key + &mov ($rounds,$rounds_); # restore $rounds + &sub ($len,0x60); + &jnc (&label("ecb_enc_loop6")); + + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + &add ($len,0x60); + &jz (&label("ecb_ret")); + +&set_label("ecb_enc_tail"); + &movups ($inout0,&QWP(0,$inp)); + &cmp ($len,0x20); + &jb (&label("ecb_enc_one")); + &movups ($inout1,&QWP(0x10,$inp)); + &je (&label("ecb_enc_two")); + &movups ($inout2,&QWP(0x20,$inp)); + &cmp ($len,0x40); + &jb (&label("ecb_enc_three")); + &movups ($inout3,&QWP(0x30,$inp)); + &je (&label("ecb_enc_four")); + &movups ($inout4,&QWP(0x40,$inp)); + &xorps ($inout5,$inout5); + &call ("_aesni_encrypt6"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); + jmp (&label("ecb_ret")); + +&set_label("ecb_enc_one",16); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &movups (&QWP(0,$out),$inout0); + &jmp (&label("ecb_ret")); + +&set_label("ecb_enc_two",16); + &xorps ($inout2,$inout2); + &call ("_aesni_encrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &jmp (&label("ecb_ret")); + +&set_label("ecb_enc_three",16); + &call ("_aesni_encrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &jmp (&label("ecb_ret")); + +&set_label("ecb_enc_four",16); + &call ("_aesni_encrypt4"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &jmp (&label("ecb_ret")); +###################################################################### +&set_label("ecb_decrypt",16); + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + &cmp ($len,0x60); + &jb (&label("ecb_dec_tail")); + + &movdqu ($inout0,&QWP(0,$inp)); + &movdqu ($inout1,&QWP(0x10,$inp)); + &movdqu ($inout2,&QWP(0x20,$inp)); + &movdqu ($inout3,&QWP(0x30,$inp)); + &movdqu ($inout4,&QWP(0x40,$inp)); + &movdqu ($inout5,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); + &sub ($len,0x60); + &jmp (&label("ecb_dec_loop6_enter")); + +&set_label("ecb_dec_loop6",16); + &movups (&QWP(0,$out),$inout0); + &movdqu ($inout0,&QWP(0,$inp)); + &movups (&QWP(0x10,$out),$inout1); + &movdqu ($inout1,&QWP(0x10,$inp)); + &movups (&QWP(0x20,$out),$inout2); + &movdqu ($inout2,&QWP(0x20,$inp)); + &movups (&QWP(0x30,$out),$inout3); + &movdqu ($inout3,&QWP(0x30,$inp)); + &movups (&QWP(0x40,$out),$inout4); + &movdqu ($inout4,&QWP(0x40,$inp)); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + &movdqu ($inout5,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); +&set_label("ecb_dec_loop6_enter"); + + &call ("_aesni_decrypt6"); + + &mov ($key,$key_); # restore $key + &mov ($rounds,$rounds_); # restore $rounds + &sub ($len,0x60); + &jnc (&label("ecb_dec_loop6")); + + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + &add ($len,0x60); + &jz (&label("ecb_ret")); + +&set_label("ecb_dec_tail"); + &movups ($inout0,&QWP(0,$inp)); + &cmp ($len,0x20); + &jb (&label("ecb_dec_one")); + &movups ($inout1,&QWP(0x10,$inp)); + &je (&label("ecb_dec_two")); + &movups ($inout2,&QWP(0x20,$inp)); + &cmp ($len,0x40); + &jb (&label("ecb_dec_three")); + &movups ($inout3,&QWP(0x30,$inp)); + &je (&label("ecb_dec_four")); + &movups ($inout4,&QWP(0x40,$inp)); + &xorps ($inout5,$inout5); + &call ("_aesni_decrypt6"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); + &jmp (&label("ecb_ret")); + +&set_label("ecb_dec_one",16); + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + &movups (&QWP(0,$out),$inout0); + &jmp (&label("ecb_ret")); + +&set_label("ecb_dec_two",16); + &xorps ($inout2,$inout2); + &call ("_aesni_decrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &jmp (&label("ecb_ret")); + +&set_label("ecb_dec_three",16); + &call ("_aesni_decrypt3"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &jmp (&label("ecb_ret")); + +&set_label("ecb_dec_four",16); + &call ("_aesni_decrypt4"); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + +&set_label("ecb_ret"); +&function_end("aesni_ecb_encrypt"); + +###################################################################### +# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, +# size_t blocks, const AES_KEY *key, +# const char *ivec,char *cmac); +# +# Handles only complete blocks, operates on 64-bit counter and +# does not update *ivec! Nor does it finalize CMAC value +# (see engine/eng_aesni.c for details) +# +{ my $cmac=$inout1; +&function_begin("aesni_ccm64_encrypt_blocks"); + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &mov ($rounds_,&wparam(4)); + &mov ($rounds,&wparam(5)); + &mov ($key_,"esp"); + &sub ("esp",60); + &and ("esp",-16); # align stack + &mov (&DWP(48,"esp"),$key_); + + &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec + &movdqu ($cmac,&QWP(0,$rounds)); # load cmac + &mov ($rounds,&DWP(240,$key)); + + # compose byte-swap control mask for pshufb on stack + &mov (&DWP(0,"esp"),0x0c0d0e0f); + &mov (&DWP(4,"esp"),0x08090a0b); + &mov (&DWP(8,"esp"),0x04050607); + &mov (&DWP(12,"esp"),0x00010203); + + # compose counter increment vector on stack + &mov ($rounds_,1); + &xor ($key_,$key_); + &mov (&DWP(16,"esp"),$rounds_); + &mov (&DWP(20,"esp"),$key_); + &mov (&DWP(24,"esp"),$key_); + &mov (&DWP(28,"esp"),$key_); + + &shr ($rounds,1); + &lea ($key_,&DWP(0,$key)); + &movdqa ($inout3,&QWP(0,"esp")); + &movdqa ($inout0,$ivec); + &mov ($rounds_,$rounds); + &pshufb ($ivec,$inout3); + +&set_label("ccm64_enc_outer"); + &$movekey ($rndkey0,&QWP(0,$key_)); + &mov ($rounds,$rounds_); + &movups ($in0,&QWP(0,$inp)); + + &xorps ($inout0,$rndkey0); + &$movekey ($rndkey1,&QWP(16,$key_)); + &xorps ($rndkey0,$in0); + &lea ($key,&DWP(32,$key_)); + &xorps ($cmac,$rndkey0); # cmac^=inp + &$movekey ($rndkey0,&QWP(0,$key)); + +&set_label("ccm64_enc2_loop"); + &aesenc ($inout0,$rndkey1); + &dec ($rounds); + &aesenc ($cmac,$rndkey1); + &$movekey ($rndkey1,&QWP(16,$key)); + &aesenc ($inout0,$rndkey0); + &lea ($key,&DWP(32,$key)); + &aesenc ($cmac,$rndkey0); + &$movekey ($rndkey0,&QWP(0,$key)); + &jnz (&label("ccm64_enc2_loop")); + &aesenc ($inout0,$rndkey1); + &aesenc ($cmac,$rndkey1); + &paddq ($ivec,&QWP(16,"esp")); + &aesenclast ($inout0,$rndkey0); + &aesenclast ($cmac,$rndkey0); + + &dec ($len); + &lea ($inp,&DWP(16,$inp)); + &xorps ($in0,$inout0); # inp^=E(ivec) + &movdqa ($inout0,$ivec); + &movups (&QWP(0,$out),$in0); # save output + &lea ($out,&DWP(16,$out)); + &pshufb ($inout0,$inout3); + &jnz (&label("ccm64_enc_outer")); + + &mov ("esp",&DWP(48,"esp")); + &mov ($out,&wparam(5)); + &movups (&QWP(0,$out),$cmac); +&function_end("aesni_ccm64_encrypt_blocks"); + +&function_begin("aesni_ccm64_decrypt_blocks"); + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &mov ($rounds_,&wparam(4)); + &mov ($rounds,&wparam(5)); + &mov ($key_,"esp"); + &sub ("esp",60); + &and ("esp",-16); # align stack + &mov (&DWP(48,"esp"),$key_); + + &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec + &movdqu ($cmac,&QWP(0,$rounds)); # load cmac + &mov ($rounds,&DWP(240,$key)); + + # compose byte-swap control mask for pshufb on stack + &mov (&DWP(0,"esp"),0x0c0d0e0f); + &mov (&DWP(4,"esp"),0x08090a0b); + &mov (&DWP(8,"esp"),0x04050607); + &mov (&DWP(12,"esp"),0x00010203); + + # compose counter increment vector on stack + &mov ($rounds_,1); + &xor ($key_,$key_); + &mov (&DWP(16,"esp"),$rounds_); + &mov (&DWP(20,"esp"),$key_); + &mov (&DWP(24,"esp"),$key_); + &mov (&DWP(28,"esp"),$key_); + + &movdqa ($inout3,&QWP(0,"esp")); # bswap mask + &movdqa ($inout0,$ivec); + + &mov ($key_,$key); + &mov ($rounds_,$rounds); + + &pshufb ($ivec,$inout3); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &movups ($in0,&QWP(0,$inp)); # load inp + &paddq ($ivec,&QWP(16,"esp")); + &lea ($inp,&QWP(16,$inp)); + &jmp (&label("ccm64_dec_outer")); + +&set_label("ccm64_dec_outer",16); + &xorps ($in0,$inout0); # inp ^= E(ivec) + &movdqa ($inout0,$ivec); + &mov ($rounds,$rounds_); + &movups (&QWP(0,$out),$in0); # save output + &lea ($out,&DWP(16,$out)); + &pshufb ($inout0,$inout3); + + &sub ($len,1); + &jz (&label("ccm64_dec_break")); + + &$movekey ($rndkey0,&QWP(0,$key_)); + &shr ($rounds,1); + &$movekey ($rndkey1,&QWP(16,$key_)); + &xorps ($in0,$rndkey0); + &lea ($key,&DWP(32,$key_)); + &xorps ($inout0,$rndkey0); + &xorps ($cmac,$in0); # cmac^=out + &$movekey ($rndkey0,&QWP(0,$key)); + +&set_label("ccm64_dec2_loop"); + &aesenc ($inout0,$rndkey1); + &dec ($rounds); + &aesenc ($cmac,$rndkey1); + &$movekey ($rndkey1,&QWP(16,$key)); + &aesenc ($inout0,$rndkey0); + &lea ($key,&DWP(32,$key)); + &aesenc ($cmac,$rndkey0); + &$movekey ($rndkey0,&QWP(0,$key)); + &jnz (&label("ccm64_dec2_loop")); + &movups ($in0,&QWP(0,$inp)); # load inp + &paddq ($ivec,&QWP(16,"esp")); + &aesenc ($inout0,$rndkey1); + &aesenc ($cmac,$rndkey1); + &lea ($inp,&QWP(16,$inp)); + &aesenclast ($inout0,$rndkey0); + &aesenclast ($cmac,$rndkey0); + &jmp (&label("ccm64_dec_outer")); + +&set_label("ccm64_dec_break",16); + &mov ($key,$key_); + if ($inline) + { &aesni_inline_generate1("enc",$cmac,$in0); } + else + { &call ("_aesni_encrypt1",$cmac); } + + &mov ("esp",&DWP(48,"esp")); + &mov ($out,&wparam(5)); + &movups (&QWP(0,$out),$cmac); +&function_end("aesni_ccm64_decrypt_blocks"); +} + +###################################################################### +# void aesni_ctr32_encrypt_blocks (const void *in, void *out, +# size_t blocks, const AES_KEY *key, +# const char *ivec); +# +# Handles only complete blocks, operates on 32-bit counter and +# does not update *ivec! (see engine/eng_aesni.c for details) +# +# stack layout: +# 0 pshufb mask +# 16 vector addend: 0,6,6,6 +# 32 counter-less ivec +# 48 1st triplet of counter vector +# 64 2nd triplet of counter vector +# 80 saved %esp + +&function_begin("aesni_ctr32_encrypt_blocks"); + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); + &mov ($rounds_,&wparam(4)); + &mov ($key_,"esp"); + &sub ("esp",88); + &and ("esp",-16); # align stack + &mov (&DWP(80,"esp"),$key_); + + &cmp ($len,1); + &je (&label("ctr32_one_shortcut")); + + &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec + + # compose byte-swap control mask for pshufb on stack + &mov (&DWP(0,"esp"),0x0c0d0e0f); + &mov (&DWP(4,"esp"),0x08090a0b); + &mov (&DWP(8,"esp"),0x04050607); + &mov (&DWP(12,"esp"),0x00010203); + + # compose counter increment vector on stack + &mov ($rounds,6); + &xor ($key_,$key_); + &mov (&DWP(16,"esp"),$rounds); + &mov (&DWP(20,"esp"),$rounds); + &mov (&DWP(24,"esp"),$rounds); + &mov (&DWP(28,"esp"),$key_); + + &pextrd ($rounds_,$inout5,3); # pull 32-bit counter + &pinsrd ($inout5,$key_,3); # wipe 32-bit counter + + &mov ($rounds,&DWP(240,$key)); # key->rounds + + # compose 2 vectors of 3x32-bit counters + &bswap ($rounds_); + &pxor ($rndkey1,$rndkey1); + &pxor ($rndkey0,$rndkey0); + &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask + &pinsrd ($rndkey1,$rounds_,0); + &lea ($key_,&DWP(3,$rounds_)); + &pinsrd ($rndkey0,$key_,0); + &inc ($rounds_); + &pinsrd ($rndkey1,$rounds_,1); + &inc ($key_); + &pinsrd ($rndkey0,$key_,1); + &inc ($rounds_); + &pinsrd ($rndkey1,$rounds_,2); + &inc ($key_); + &pinsrd ($rndkey0,$key_,2); + &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet + &pshufb ($rndkey1,$inout0); # byte swap + &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet + &pshufb ($rndkey0,$inout0); # byte swap + + &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword + &pshufd ($inout1,$rndkey1,2<<6); + &cmp ($len,6); + &jb (&label("ctr32_tail")); + &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec + &shr ($rounds,1); + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + &sub ($len,6); + &jmp (&label("ctr32_loop6")); + +&set_label("ctr32_loop6",16); + &pshufd ($inout2,$rndkey1,1<<6); + &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec + &pshufd ($inout3,$rndkey0,3<<6); + &por ($inout0,$rndkey1); # merge counter-less ivec + &pshufd ($inout4,$rndkey0,2<<6); + &por ($inout1,$rndkey1); + &pshufd ($inout5,$rndkey0,1<<6); + &por ($inout2,$rndkey1); + &por ($inout3,$rndkey1); + &por ($inout4,$rndkey1); + &por ($inout5,$rndkey1); + + # inlining _aesni_encrypt6's prologue gives ~4% improvement... + &$movekey ($rndkey0,&QWP(0,$key_)); + &$movekey ($rndkey1,&QWP(16,$key_)); + &lea ($key,&DWP(32,$key_)); + &dec ($rounds); + &pxor ($inout0,$rndkey0); + &pxor ($inout1,$rndkey0); + &aesenc ($inout0,$rndkey1); + &pxor ($inout2,$rndkey0); + &aesenc ($inout1,$rndkey1); + &pxor ($inout3,$rndkey0); + &aesenc ($inout2,$rndkey1); + &pxor ($inout4,$rndkey0); + &aesenc ($inout3,$rndkey1); + &pxor ($inout5,$rndkey0); + &aesenc ($inout4,$rndkey1); + &$movekey ($rndkey0,&QWP(0,$key)); + &aesenc ($inout5,$rndkey1); + + &call (&label("_aesni_encrypt6_enter")); + + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout1,$rndkey0); + &movups (&QWP(0,$out),$inout0); + &movdqa ($rndkey0,&QWP(16,"esp")); # load increment + &xorps ($inout2,$rndkey1); + &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + + &paddd ($rndkey1,$rndkey0); # 1st triplet increment + &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment + &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask + + &movups ($inout1,&QWP(0x30,$inp)); + &movups ($inout2,&QWP(0x40,$inp)); + &xorps ($inout3,$inout1); + &movups ($inout1,&QWP(0x50,$inp)); + &lea ($inp,&DWP(0x60,$inp)); + &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet + &pshufb ($rndkey1,$inout0); # byte swap + &xorps ($inout4,$inout2); + &movups (&QWP(0x30,$out),$inout3); + &xorps ($inout5,$inout1); + &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet + &pshufb ($rndkey0,$inout0); # byte swap + &movups (&QWP(0x40,$out),$inout4); + &pshufd ($inout0,$rndkey1,3<<6); + &movups (&QWP(0x50,$out),$inout5); + &lea ($out,&DWP(0x60,$out)); + + &mov ($rounds,$rounds_); + &pshufd ($inout1,$rndkey1,2<<6); + &sub ($len,6); + &jnc (&label("ctr32_loop6")); + + &add ($len,6); + &jz (&label("ctr32_ret")); + &mov ($key,$key_); + &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds + &movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec + +&set_label("ctr32_tail"); + &por ($inout0,$inout5); + &cmp ($len,2); + &jb (&label("ctr32_one")); + + &pshufd ($inout2,$rndkey1,1<<6); + &por ($inout1,$inout5); + &je (&label("ctr32_two")); + + &pshufd ($inout3,$rndkey0,3<<6); + &por ($inout2,$inout5); + &cmp ($len,4); + &jb (&label("ctr32_three")); + + &pshufd ($inout4,$rndkey0,2<<6); + &por ($inout3,$inout5); + &je (&label("ctr32_four")); + + &por ($inout4,$inout5); + &call ("_aesni_encrypt6"); + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout1,$rndkey0); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout2,$rndkey1); + &movups ($rndkey1,&QWP(0x40,$inp)); + &xorps ($inout3,$rndkey0); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout4,$rndkey1); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &movups (&QWP(0x40,$out),$inout4); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_one_shortcut",16); + &movups ($inout0,&QWP(0,$rounds_)); # load ivec + &mov ($rounds,&DWP(240,$key)); + +&set_label("ctr32_one"); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &movups ($in0,&QWP(0,$inp)); + &xorps ($in0,$inout0); + &movups (&QWP(0,$out),$in0); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_two",16); + &call ("_aesni_encrypt3"); + &movups ($inout3,&QWP(0,$inp)); + &movups ($inout4,&QWP(0x10,$inp)); + &xorps ($inout0,$inout3); + &xorps ($inout1,$inout4); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_three",16); + &call ("_aesni_encrypt3"); + &movups ($inout3,&QWP(0,$inp)); + &movups ($inout4,&QWP(0x10,$inp)); + &xorps ($inout0,$inout3); + &movups ($inout5,&QWP(0x20,$inp)); + &xorps ($inout1,$inout4); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout2,$inout5); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &jmp (&label("ctr32_ret")); + +&set_label("ctr32_four",16); + &call ("_aesni_encrypt4"); + &movups ($inout4,&QWP(0,$inp)); + &movups ($inout5,&QWP(0x10,$inp)); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout0,$inout4); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout1,$inout5); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout2,$rndkey1); + &movups (&QWP(0x10,$out),$inout1); + &xorps ($inout3,$rndkey0); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + +&set_label("ctr32_ret"); + &mov ("esp",&DWP(80,"esp")); +&function_end("aesni_ctr32_encrypt_blocks"); + +###################################################################### +# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2 +# const unsigned char iv[16]); +# +{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1); + +&function_begin("aesni_xts_encrypt"); + &mov ($key,&wparam(4)); # key2 + &mov ($inp,&wparam(5)); # clear-text tweak + + &mov ($rounds,&DWP(240,$key)); # key2->rounds + &movups ($inout0,&QWP(0,$inp)); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); # key1 + + &mov ($key_,"esp"); + &sub ("esp",16*7+8); + &mov ($rounds,&DWP(240,$key)); # key1->rounds + &and ("esp",-16); # align stack + + &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant + &mov (&DWP(16*6+4,"esp"),0); + &mov (&DWP(16*6+8,"esp"),1); + &mov (&DWP(16*6+12,"esp"),0); + &mov (&DWP(16*7+0,"esp"),$len); # save original $len + &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp + + &movdqa ($tweak,$inout0); + &pxor ($twtmp,$twtmp); + &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + + &and ($len,-16); + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + &sub ($len,16*6); + &jc (&label("xts_enc_short")); + + &shr ($rounds,1); + &mov ($rounds_,$rounds); + &jmp (&label("xts_enc_loop6")); + +&set_label("xts_enc_loop6",16); + for ($i=0;$i<4;$i++) { + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa (&QWP(16*$i,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd ($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + } + &pshufd ($inout5,$twtmp,0x13); + &movdqa (&QWP(16*$i++,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &$movekey ($rndkey0,&QWP(0,$key_)); + &pand ($inout5,$twmask); # isolate carry and residue + &movups ($inout0,&QWP(0,$inp)); # load input + &pxor ($inout5,$tweak); + + # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] + &movdqu ($inout1,&QWP(16*1,$inp)); + &xorps ($inout0,$rndkey0); # input^=rndkey[0] + &movdqu ($inout2,&QWP(16*2,$inp)); + &pxor ($inout1,$rndkey0); + &movdqu ($inout3,&QWP(16*3,$inp)); + &pxor ($inout2,$rndkey0); + &movdqu ($inout4,&QWP(16*4,$inp)); + &pxor ($inout3,$rndkey0); + &movdqu ($rndkey1,&QWP(16*5,$inp)); + &pxor ($inout4,$rndkey0); + &lea ($inp,&DWP(16*6,$inp)); + &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak + &pxor ($inout5,$rndkey1); + + &$movekey ($rndkey1,&QWP(16,$key_)); + &lea ($key,&DWP(32,$key_)); + &pxor ($inout1,&QWP(16*1,"esp")); + &aesenc ($inout0,$rndkey1); + &pxor ($inout2,&QWP(16*2,"esp")); + &aesenc ($inout1,$rndkey1); + &pxor ($inout3,&QWP(16*3,"esp")); + &dec ($rounds); + &aesenc ($inout2,$rndkey1); + &pxor ($inout4,&QWP(16*4,"esp")); + &aesenc ($inout3,$rndkey1); + &pxor ($inout5,$rndkey0); + &aesenc ($inout4,$rndkey1); + &$movekey ($rndkey0,&QWP(0,$key)); + &aesenc ($inout5,$rndkey1); + &call (&label("_aesni_encrypt6_enter")); + + &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak + &pxor ($twtmp,$twtmp); + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &pcmpgtd ($twtmp,$tweak); # broadcast upper bits + &xorps ($inout1,&QWP(16*1,"esp")); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout2,&QWP(16*2,"esp")); + &movups (&QWP(16*1,$out),$inout1); + &xorps ($inout3,&QWP(16*3,"esp")); + &movups (&QWP(16*2,$out),$inout2); + &xorps ($inout4,&QWP(16*4,"esp")); + &movups (&QWP(16*3,$out),$inout3); + &xorps ($inout5,$tweak); + &movups (&QWP(16*4,$out),$inout4); + &pshufd ($twres,$twtmp,0x13); + &movups (&QWP(16*5,$out),$inout5); + &lea ($out,&DWP(16*6,$out)); + &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 + + &pxor ($twtmp,$twtmp); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &mov ($rounds,$rounds_); # restore $rounds + &pxor ($tweak,$twres); + + &sub ($len,16*6); + &jnc (&label("xts_enc_loop6")); + + &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds + &mov ($key,$key_); # restore $key + &mov ($rounds_,$rounds); + +&set_label("xts_enc_short"); + &add ($len,16*6); + &jz (&label("xts_enc_done6x")); + + &movdqa ($inout3,$tweak); # put aside previous tweak + &cmp ($len,0x20); + &jb (&label("xts_enc_one")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &je (&label("xts_enc_two")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa ($inout4,$tweak); # put aside previous tweak + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &cmp ($len,0x40); + &jb (&label("xts_enc_three")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa ($inout5,$tweak); # put aside previous tweak + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &movdqa (&QWP(16*0,"esp"),$inout3); + &movdqa (&QWP(16*1,"esp"),$inout4); + &je (&label("xts_enc_four")); + + &movdqa (&QWP(16*2,"esp"),$inout5); + &pshufd ($inout5,$twtmp,0x13); + &movdqa (&QWP(16*3,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($inout0,1); + &pand ($inout5,$twmask); # isolate carry and residue + &pxor ($inout5,$tweak); + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movdqu ($inout3,&QWP(16*3,$inp)); + &pxor ($inout1,&QWP(16*1,"esp")); + &movdqu ($inout4,&QWP(16*4,$inp)); + &pxor ($inout2,&QWP(16*2,"esp")); + &lea ($inp,&DWP(16*5,$inp)); + &pxor ($inout3,&QWP(16*3,"esp")); + &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak + &pxor ($inout4,$inout5); + + &call ("_aesni_encrypt6"); + + &movaps ($tweak,&QWP(16*4,"esp")); # last tweak + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,&QWP(16*2,"esp")); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout3,&QWP(16*3,"esp")); + &movups (&QWP(16*1,$out),$inout1); + &xorps ($inout4,$tweak); + &movups (&QWP(16*2,$out),$inout2); + &movups (&QWP(16*3,$out),$inout3); + &movups (&QWP(16*4,$out),$inout4); + &lea ($out,&DWP(16*5,$out)); + &jmp (&label("xts_enc_done")); + +&set_label("xts_enc_one",16); + &movups ($inout0,&QWP(16*0,$inp)); # load input + &lea ($inp,&DWP(16*1,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &xorps ($inout0,$inout3); # output^=tweak + &movups (&QWP(16*0,$out),$inout0); # write output + &lea ($out,&DWP(16*1,$out)); + + &movdqa ($tweak,$inout3); # last tweak + &jmp (&label("xts_enc_done")); + +&set_label("xts_enc_two",16); + &movaps ($inout4,$tweak); # put aside last tweak + + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &lea ($inp,&DWP(16*2,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout2); + + &call ("_aesni_encrypt3"); + + &xorps ($inout0,$inout3); # output^=tweak + &xorps ($inout1,$inout4); + &movups (&QWP(16*0,$out),$inout0); # write output + &movups (&QWP(16*1,$out),$inout1); + &lea ($out,&DWP(16*2,$out)); + + &movdqa ($tweak,$inout4); # last tweak + &jmp (&label("xts_enc_done")); + +&set_label("xts_enc_three",16); + &movaps ($inout5,$tweak); # put aside last tweak + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &movups ($inout2,&QWP(16*2,$inp)); + &lea ($inp,&DWP(16*3,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + + &call ("_aesni_encrypt3"); + + &xorps ($inout0,$inout3); # output^=tweak + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + &movups (&QWP(16*0,$out),$inout0); # write output + &movups (&QWP(16*1,$out),$inout1); + &movups (&QWP(16*2,$out),$inout2); + &lea ($out,&DWP(16*3,$out)); + + &movdqa ($tweak,$inout5); # last tweak + &jmp (&label("xts_enc_done")); + +&set_label("xts_enc_four",16); + &movaps ($inout4,$tweak); # put aside last tweak + + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &movups ($inout2,&QWP(16*2,$inp)); + &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movups ($inout3,&QWP(16*3,$inp)); + &lea ($inp,&DWP(16*4,$inp)); + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout5); + &xorps ($inout3,$inout4); + + &call ("_aesni_encrypt4"); + + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout5); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout3,$inout4); + &movups (&QWP(16*1,$out),$inout1); + &movups (&QWP(16*2,$out),$inout2); + &movups (&QWP(16*3,$out),$inout3); + &lea ($out,&DWP(16*4,$out)); + + &movdqa ($tweak,$inout4); # last tweak + &jmp (&label("xts_enc_done")); + +&set_label("xts_enc_done6x",16); # $tweak is pre-calculated + &mov ($len,&DWP(16*7+0,"esp")); # restore original $len + &and ($len,15); + &jz (&label("xts_enc_ret")); + &movdqa ($inout3,$tweak); + &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 + &jmp (&label("xts_enc_steal")); + +&set_label("xts_enc_done",16); + &mov ($len,&DWP(16*7+0,"esp")); # restore original $len + &pxor ($twtmp,$twtmp); + &and ($len,15); + &jz (&label("xts_enc_ret")); + + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 + &pshufd ($inout3,$twtmp,0x13); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue + &pxor ($inout3,$tweak); + +&set_label("xts_enc_steal"); + &movz ($rounds,&BP(0,$inp)); + &movz ($key,&BP(-16,$out)); + &lea ($inp,&DWP(1,$inp)); + &mov (&BP(-16,$out),&LB($rounds)); + &mov (&BP(0,$out),&LB($key)); + &lea ($out,&DWP(1,$out)); + &sub ($len,1); + &jnz (&label("xts_enc_steal")); + + &sub ($out,&DWP(16*7+0,"esp")); # rewind $out + &mov ($key,$key_); # restore $key + &mov ($rounds,$rounds_); # restore $rounds + + &movups ($inout0,&QWP(-16,$out)); # load input + &xorps ($inout0,$inout3); # input^=tweak + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + &xorps ($inout0,$inout3); # output^=tweak + &movups (&QWP(-16,$out),$inout0); # write output + +&set_label("xts_enc_ret"); + &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp +&function_end("aesni_xts_encrypt"); + +&function_begin("aesni_xts_decrypt"); + &mov ($key,&wparam(4)); # key2 + &mov ($inp,&wparam(5)); # clear-text tweak + + &mov ($rounds,&DWP(240,$key)); # key2->rounds + &movups ($inout0,&QWP(0,$inp)); + if ($inline) + { &aesni_inline_generate1("enc"); } + else + { &call ("_aesni_encrypt1"); } + + &mov ($inp,&wparam(0)); + &mov ($out,&wparam(1)); + &mov ($len,&wparam(2)); + &mov ($key,&wparam(3)); # key1 + + &mov ($key_,"esp"); + &sub ("esp",16*7+8); + &and ("esp",-16); # align stack + + &xor ($rounds_,$rounds_); # if(len%16) len-=16; + &test ($len,15); + &setnz (&LB($rounds_)); + &shl ($rounds_,4); + &sub ($len,$rounds_); + + &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant + &mov (&DWP(16*6+4,"esp"),0); + &mov (&DWP(16*6+8,"esp"),1); + &mov (&DWP(16*6+12,"esp"),0); + &mov (&DWP(16*7+0,"esp"),$len); # save original $len + &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp + + &mov ($rounds,&DWP(240,$key)); # key1->rounds + &mov ($key_,$key); # backup $key + &mov ($rounds_,$rounds); # backup $rounds + + &movdqa ($tweak,$inout0); + &pxor ($twtmp,$twtmp); + &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87 + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + + &and ($len,-16); + &sub ($len,16*6); + &jc (&label("xts_dec_short")); + + &shr ($rounds,1); + &mov ($rounds_,$rounds); + &jmp (&label("xts_dec_loop6")); + +&set_label("xts_dec_loop6",16); + for ($i=0;$i<4;$i++) { + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa (&QWP(16*$i,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd ($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + } + &pshufd ($inout5,$twtmp,0x13); + &movdqa (&QWP(16*$i++,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &$movekey ($rndkey0,&QWP(0,$key_)); + &pand ($inout5,$twmask); # isolate carry and residue + &movups ($inout0,&QWP(0,$inp)); # load input + &pxor ($inout5,$tweak); + + # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0] + &movdqu ($inout1,&QWP(16*1,$inp)); + &xorps ($inout0,$rndkey0); # input^=rndkey[0] + &movdqu ($inout2,&QWP(16*2,$inp)); + &pxor ($inout1,$rndkey0); + &movdqu ($inout3,&QWP(16*3,$inp)); + &pxor ($inout2,$rndkey0); + &movdqu ($inout4,&QWP(16*4,$inp)); + &pxor ($inout3,$rndkey0); + &movdqu ($rndkey1,&QWP(16*5,$inp)); + &pxor ($inout4,$rndkey0); + &lea ($inp,&DWP(16*6,$inp)); + &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak + &pxor ($inout5,$rndkey1); + + &$movekey ($rndkey1,&QWP(16,$key_)); + &lea ($key,&DWP(32,$key_)); + &pxor ($inout1,&QWP(16*1,"esp")); + &aesdec ($inout0,$rndkey1); + &pxor ($inout2,&QWP(16*2,"esp")); + &aesdec ($inout1,$rndkey1); + &pxor ($inout3,&QWP(16*3,"esp")); + &dec ($rounds); + &aesdec ($inout2,$rndkey1); + &pxor ($inout4,&QWP(16*4,"esp")); + &aesdec ($inout3,$rndkey1); + &pxor ($inout5,$rndkey0); + &aesdec ($inout4,$rndkey1); + &$movekey ($rndkey0,&QWP(0,$key)); + &aesdec ($inout5,$rndkey1); + &call (&label("_aesni_decrypt6_enter")); + + &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak + &pxor ($twtmp,$twtmp); + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &pcmpgtd ($twtmp,$tweak); # broadcast upper bits + &xorps ($inout1,&QWP(16*1,"esp")); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout2,&QWP(16*2,"esp")); + &movups (&QWP(16*1,$out),$inout1); + &xorps ($inout3,&QWP(16*3,"esp")); + &movups (&QWP(16*2,$out),$inout2); + &xorps ($inout4,&QWP(16*4,"esp")); + &movups (&QWP(16*3,$out),$inout3); + &xorps ($inout5,$tweak); + &movups (&QWP(16*4,$out),$inout4); + &pshufd ($twres,$twtmp,0x13); + &movups (&QWP(16*5,$out),$inout5); + &lea ($out,&DWP(16*6,$out)); + &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87 + + &pxor ($twtmp,$twtmp); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &mov ($rounds,$rounds_); # restore $rounds + &pxor ($tweak,$twres); + + &sub ($len,16*6); + &jnc (&label("xts_dec_loop6")); + + &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds + &mov ($key,$key_); # restore $key + &mov ($rounds_,$rounds); + +&set_label("xts_dec_short"); + &add ($len,16*6); + &jz (&label("xts_dec_done6x")); + + &movdqa ($inout3,$tweak); # put aside previous tweak + &cmp ($len,0x20); + &jb (&label("xts_dec_one")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &je (&label("xts_dec_two")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa ($inout4,$tweak); # put aside previous tweak + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &cmp ($len,0x40); + &jb (&label("xts_dec_three")); + + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa ($inout5,$tweak); # put aside previous tweak + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + &movdqa (&QWP(16*0,"esp"),$inout3); + &movdqa (&QWP(16*1,"esp"),$inout4); + &je (&label("xts_dec_four")); + + &movdqa (&QWP(16*2,"esp"),$inout5); + &pshufd ($inout5,$twtmp,0x13); + &movdqa (&QWP(16*3,"esp"),$tweak); + &paddq ($tweak,$tweak); # &psllq($inout0,1); + &pand ($inout5,$twmask); # isolate carry and residue + &pxor ($inout5,$tweak); + + &movdqu ($inout0,&QWP(16*0,$inp)); # load input + &movdqu ($inout1,&QWP(16*1,$inp)); + &movdqu ($inout2,&QWP(16*2,$inp)); + &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movdqu ($inout3,&QWP(16*3,$inp)); + &pxor ($inout1,&QWP(16*1,"esp")); + &movdqu ($inout4,&QWP(16*4,$inp)); + &pxor ($inout2,&QWP(16*2,"esp")); + &lea ($inp,&DWP(16*5,$inp)); + &pxor ($inout3,&QWP(16*3,"esp")); + &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak + &pxor ($inout4,$inout5); + + &call ("_aesni_decrypt6"); + + &movaps ($tweak,&QWP(16*4,"esp")); # last tweak + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,&QWP(16*2,"esp")); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout3,&QWP(16*3,"esp")); + &movups (&QWP(16*1,$out),$inout1); + &xorps ($inout4,$tweak); + &movups (&QWP(16*2,$out),$inout2); + &movups (&QWP(16*3,$out),$inout3); + &movups (&QWP(16*4,$out),$inout4); + &lea ($out,&DWP(16*5,$out)); + &jmp (&label("xts_dec_done")); + +&set_label("xts_dec_one",16); + &movups ($inout0,&QWP(16*0,$inp)); # load input + &lea ($inp,&DWP(16*1,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + &xorps ($inout0,$inout3); # output^=tweak + &movups (&QWP(16*0,$out),$inout0); # write output + &lea ($out,&DWP(16*1,$out)); + + &movdqa ($tweak,$inout3); # last tweak + &jmp (&label("xts_dec_done")); + +&set_label("xts_dec_two",16); + &movaps ($inout4,$tweak); # put aside last tweak + + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &lea ($inp,&DWP(16*2,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + &xorps ($inout1,$inout4); + + &call ("_aesni_decrypt3"); + + &xorps ($inout0,$inout3); # output^=tweak + &xorps ($inout1,$inout4); + &movups (&QWP(16*0,$out),$inout0); # write output + &movups (&QWP(16*1,$out),$inout1); + &lea ($out,&DWP(16*2,$out)); + + &movdqa ($tweak,$inout4); # last tweak + &jmp (&label("xts_dec_done")); + +&set_label("xts_dec_three",16); + &movaps ($inout5,$tweak); # put aside last tweak + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &movups ($inout2,&QWP(16*2,$inp)); + &lea ($inp,&DWP(16*3,$inp)); + &xorps ($inout0,$inout3); # input^=tweak + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + + &call ("_aesni_decrypt3"); + + &xorps ($inout0,$inout3); # output^=tweak + &xorps ($inout1,$inout4); + &xorps ($inout2,$inout5); + &movups (&QWP(16*0,$out),$inout0); # write output + &movups (&QWP(16*1,$out),$inout1); + &movups (&QWP(16*2,$out),$inout2); + &lea ($out,&DWP(16*3,$out)); + + &movdqa ($tweak,$inout5); # last tweak + &jmp (&label("xts_dec_done")); + +&set_label("xts_dec_four",16); + &movaps ($inout4,$tweak); # put aside last tweak + + &movups ($inout0,&QWP(16*0,$inp)); # load input + &movups ($inout1,&QWP(16*1,$inp)); + &movups ($inout2,&QWP(16*2,$inp)); + &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak + &movups ($inout3,&QWP(16*3,$inp)); + &lea ($inp,&DWP(16*4,$inp)); + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout5); + &xorps ($inout3,$inout4); + + &call ("_aesni_decrypt4"); + + &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak + &xorps ($inout1,&QWP(16*1,"esp")); + &xorps ($inout2,$inout5); + &movups (&QWP(16*0,$out),$inout0); # write output + &xorps ($inout3,$inout4); + &movups (&QWP(16*1,$out),$inout1); + &movups (&QWP(16*2,$out),$inout2); + &movups (&QWP(16*3,$out),$inout3); + &lea ($out,&DWP(16*4,$out)); + + &movdqa ($tweak,$inout4); # last tweak + &jmp (&label("xts_dec_done")); + +&set_label("xts_dec_done6x",16); # $tweak is pre-calculated + &mov ($len,&DWP(16*7+0,"esp")); # restore original $len + &and ($len,15); + &jz (&label("xts_dec_ret")); + &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 + &jmp (&label("xts_dec_only_one_more")); + +&set_label("xts_dec_done",16); + &mov ($len,&DWP(16*7+0,"esp")); # restore original $len + &pxor ($twtmp,$twtmp); + &and ($len,15); + &jz (&label("xts_dec_ret")); + + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &mov (&DWP(16*7+0,"esp"),$len); # save $len%16 + &pshufd ($twres,$twtmp,0x13); + &pxor ($twtmp,$twtmp); + &movdqa ($twmask,&QWP(16*6,"esp")); + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($twres,$twmask); # isolate carry and residue + &pcmpgtd($twtmp,$tweak); # broadcast upper bits + &pxor ($tweak,$twres); + +&set_label("xts_dec_only_one_more"); + &pshufd ($inout3,$twtmp,0x13); + &movdqa ($inout4,$tweak); # put aside previous tweak + &paddq ($tweak,$tweak); # &psllq($tweak,1); + &pand ($inout3,$twmask); # isolate carry and residue + &pxor ($inout3,$tweak); + + &mov ($key,$key_); # restore $key + &mov ($rounds,$rounds_); # restore $rounds + + &movups ($inout0,&QWP(0,$inp)); # load input + &xorps ($inout0,$inout3); # input^=tweak + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + &xorps ($inout0,$inout3); # output^=tweak + &movups (&QWP(0,$out),$inout0); # write output + +&set_label("xts_dec_steal"); + &movz ($rounds,&BP(16,$inp)); + &movz ($key,&BP(0,$out)); + &lea ($inp,&DWP(1,$inp)); + &mov (&BP(0,$out),&LB($rounds)); + &mov (&BP(16,$out),&LB($key)); + &lea ($out,&DWP(1,$out)); + &sub ($len,1); + &jnz (&label("xts_dec_steal")); + + &sub ($out,&DWP(16*7+0,"esp")); # rewind $out + &mov ($key,$key_); # restore $key + &mov ($rounds,$rounds_); # restore $rounds + + &movups ($inout0,&QWP(0,$out)); # load input + &xorps ($inout0,$inout4); # input^=tweak + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + &xorps ($inout0,$inout4); # output^=tweak + &movups (&QWP(0,$out),$inout0); # write output + +&set_label("xts_dec_ret"); + &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp +&function_end("aesni_xts_decrypt"); +} +} + +###################################################################### +# void $PREFIX_cbc_encrypt (const void *inp, void *out, +# size_t length, const AES_KEY *key, +# unsigned char *ivp,const int enc); +&function_begin("${PREFIX}_cbc_encrypt"); + &mov ($inp,&wparam(0)); + &mov ($rounds_,"esp"); + &mov ($out,&wparam(1)); + &sub ($rounds_,24); + &mov ($len,&wparam(2)); + &and ($rounds_,-16); + &mov ($key,&wparam(3)); + &mov ($key_,&wparam(4)); + &test ($len,$len); + &jz (&label("cbc_abort")); + + &cmp (&wparam(5),0); + &xchg ($rounds_,"esp"); # alloca + &movups ($ivec,&QWP(0,$key_)); # load IV + &mov ($rounds,&DWP(240,$key)); + &mov ($key_,$key); # backup $key + &mov (&DWP(16,"esp"),$rounds_); # save original %esp + &mov ($rounds_,$rounds); # backup $rounds + &je (&label("cbc_decrypt")); + + &movaps ($inout0,$ivec); + &cmp ($len,16); + &jb (&label("cbc_enc_tail")); + &sub ($len,16); + &jmp (&label("cbc_enc_loop")); + +&set_label("cbc_enc_loop",16); + &movups ($ivec,&QWP(0,$inp)); # input actually + &lea ($inp,&DWP(16,$inp)); + if ($inline) + { &aesni_inline_generate1("enc",$inout0,$ivec); } + else + { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); } + &mov ($rounds,$rounds_); # restore $rounds + &mov ($key,$key_); # restore $key + &movups (&QWP(0,$out),$inout0); # store output + &lea ($out,&DWP(16,$out)); + &sub ($len,16); + &jnc (&label("cbc_enc_loop")); + &add ($len,16); + &jnz (&label("cbc_enc_tail")); + &movaps ($ivec,$inout0); + &jmp (&label("cbc_ret")); + +&set_label("cbc_enc_tail"); + &mov ("ecx",$len); # zaps $rounds + &data_word(0xA4F3F689); # rep movsb + &mov ("ecx",16); # zero tail + &sub ("ecx",$len); + &xor ("eax","eax"); # zaps $len + &data_word(0xAAF3F689); # rep stosb + &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block + &mov ($rounds,$rounds_); # restore $rounds + &mov ($inp,$out); # $inp and $out are the same + &mov ($key,$key_); # restore $key + &jmp (&label("cbc_enc_loop")); +###################################################################### +&set_label("cbc_decrypt",16); + &cmp ($len,0x50); + &jbe (&label("cbc_dec_tail")); + &movaps (&QWP(0,"esp"),$ivec); # save IV + &sub ($len,0x50); + &jmp (&label("cbc_dec_loop6_enter")); + +&set_label("cbc_dec_loop6",16); + &movaps (&QWP(0,"esp"),$rndkey0); # save IV + &movups (&QWP(0,$out),$inout5); + &lea ($out,&DWP(0x10,$out)); +&set_label("cbc_dec_loop6_enter"); + &movdqu ($inout0,&QWP(0,$inp)); + &movdqu ($inout1,&QWP(0x10,$inp)); + &movdqu ($inout2,&QWP(0x20,$inp)); + &movdqu ($inout3,&QWP(0x30,$inp)); + &movdqu ($inout4,&QWP(0x40,$inp)); + &movdqu ($inout5,&QWP(0x50,$inp)); + + &call ("_aesni_decrypt6"); + + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,&QWP(0,"esp")); # ^=IV + &xorps ($inout1,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout2,$rndkey0); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout3,$rndkey1); + &movups ($rndkey1,&QWP(0x40,$inp)); + &xorps ($inout4,$rndkey0); + &movups ($rndkey0,&QWP(0x50,$inp)); # IV + &xorps ($inout5,$rndkey1); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &lea ($inp,&DWP(0x60,$inp)); + &movups (&QWP(0x20,$out),$inout2); + &mov ($rounds,$rounds_) # restore $rounds + &movups (&QWP(0x30,$out),$inout3); + &mov ($key,$key_); # restore $key + &movups (&QWP(0x40,$out),$inout4); + &lea ($out,&DWP(0x50,$out)); + &sub ($len,0x60); + &ja (&label("cbc_dec_loop6")); + + &movaps ($inout0,$inout5); + &movaps ($ivec,$rndkey0); + &add ($len,0x50); + &jle (&label("cbc_dec_tail_collected")); + &movups (&QWP(0,$out),$inout0); + &lea ($out,&DWP(0x10,$out)); +&set_label("cbc_dec_tail"); + &movups ($inout0,&QWP(0,$inp)); + &movaps ($in0,$inout0); + &cmp ($len,0x10); + &jbe (&label("cbc_dec_one")); + + &movups ($inout1,&QWP(0x10,$inp)); + &movaps ($in1,$inout1); + &cmp ($len,0x20); + &jbe (&label("cbc_dec_two")); + + &movups ($inout2,&QWP(0x20,$inp)); + &cmp ($len,0x30); + &jbe (&label("cbc_dec_three")); + + &movups ($inout3,&QWP(0x30,$inp)); + &cmp ($len,0x40); + &jbe (&label("cbc_dec_four")); + + &movups ($inout4,&QWP(0x40,$inp)); + &movaps (&QWP(0,"esp"),$ivec); # save IV + &movups ($inout0,&QWP(0,$inp)); + &xorps ($inout5,$inout5); + &call ("_aesni_decrypt6"); + &movups ($rndkey1,&QWP(0,$inp)); + &movups ($rndkey0,&QWP(0x10,$inp)); + &xorps ($inout0,&QWP(0,"esp")); # ^= IV + &xorps ($inout1,$rndkey1); + &movups ($rndkey1,&QWP(0x20,$inp)); + &xorps ($inout2,$rndkey0); + &movups ($rndkey0,&QWP(0x30,$inp)); + &xorps ($inout3,$rndkey1); + &movups ($ivec,&QWP(0x40,$inp)); # IV + &xorps ($inout4,$rndkey0); + &movups (&QWP(0,$out),$inout0); + &movups (&QWP(0x10,$out),$inout1); + &movups (&QWP(0x20,$out),$inout2); + &movups (&QWP(0x30,$out),$inout3); + &lea ($out,&DWP(0x40,$out)); + &movaps ($inout0,$inout4); + &sub ($len,0x50); + &jmp (&label("cbc_dec_tail_collected")); + +&set_label("cbc_dec_one",16); + if ($inline) + { &aesni_inline_generate1("dec"); } + else + { &call ("_aesni_decrypt1"); } + &xorps ($inout0,$ivec); + &movaps ($ivec,$in0); + &sub ($len,0x10); + &jmp (&label("cbc_dec_tail_collected")); + +&set_label("cbc_dec_two",16); + &xorps ($inout2,$inout2); + &call ("_aesni_decrypt3"); + &xorps ($inout0,$ivec); + &xorps ($inout1,$in0); + &movups (&QWP(0,$out),$inout0); + &movaps ($inout0,$inout1); + &lea ($out,&DWP(0x10,$out)); + &movaps ($ivec,$in1); + &sub ($len,0x20); + &jmp (&label("cbc_dec_tail_collected")); + +&set_label("cbc_dec_three",16); + &call ("_aesni_decrypt3"); + &xorps ($inout0,$ivec); + &xorps ($inout1,$in0); + &xorps ($inout2,$in1); + &movups (&QWP(0,$out),$inout0); + &movaps ($inout0,$inout2); + &movups (&QWP(0x10,$out),$inout1); + &lea ($out,&DWP(0x20,$out)); + &movups ($ivec,&QWP(0x20,$inp)); + &sub ($len,0x30); + &jmp (&label("cbc_dec_tail_collected")); + +&set_label("cbc_dec_four",16); + &call ("_aesni_decrypt4"); + &movups ($rndkey1,&QWP(0x10,$inp)); + &movups ($rndkey0,&QWP(0x20,$inp)); + &xorps ($inout0,$ivec); + &movups ($ivec,&QWP(0x30,$inp)); + &xorps ($inout1,$in0); + &movups (&QWP(0,$out),$inout0); + &xorps ($inout2,$rndkey1); + &movups (&QWP(0x10,$out),$inout1); + &xorps ($inout3,$rndkey0); + &movups (&QWP(0x20,$out),$inout2); + &lea ($out,&DWP(0x30,$out)); + &movaps ($inout0,$inout3); + &sub ($len,0x40); + +&set_label("cbc_dec_tail_collected"); + &and ($len,15); + &jnz (&label("cbc_dec_tail_partial")); + &movups (&QWP(0,$out),$inout0); + &jmp (&label("cbc_ret")); + +&set_label("cbc_dec_tail_partial",16); + &movaps (&QWP(0,"esp"),$inout0); + &mov ("ecx",16); + &mov ($inp,"esp"); + &sub ("ecx",$len); + &data_word(0xA4F3F689); # rep movsb + +&set_label("cbc_ret"); + &mov ("esp",&DWP(16,"esp")); # pull original %esp + &mov ($key_,&wparam(4)); + &movups (&QWP(0,$key_),$ivec); # output IV +&set_label("cbc_abort"); +&function_end("${PREFIX}_cbc_encrypt"); + +###################################################################### +# Mechanical port from aesni-x86_64.pl. +# +# _aesni_set_encrypt_key is private interface, +# input: +# "eax" const unsigned char *userKey +# $rounds int bits +# $key AES_KEY *key +# output: +# "eax" return code +# $round rounds + +&function_begin_B("_aesni_set_encrypt_key"); + &test ("eax","eax"); + &jz (&label("bad_pointer")); + &test ($key,$key); + &jz (&label("bad_pointer")); + + &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey + &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0 + &lea ($key,&DWP(16,$key)); + &cmp ($rounds,256); + &je (&label("14rounds")); + &cmp ($rounds,192); + &je (&label("12rounds")); + &cmp ($rounds,128); + &jne (&label("bad_keybits")); + +&set_label("10rounds",16); + &mov ($rounds,9); + &$movekey (&QWP(-16,$key),"xmm0"); # round 0 + &aeskeygenassist("xmm1","xmm0",0x01); # round 1 + &call (&label("key_128_cold")); + &aeskeygenassist("xmm1","xmm0",0x2); # round 2 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x04); # round 3 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x08); # round 4 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x10); # round 5 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x20); # round 6 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x40); # round 7 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x80); # round 8 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x1b); # round 9 + &call (&label("key_128")); + &aeskeygenassist("xmm1","xmm0",0x36); # round 10 + &call (&label("key_128")); + &$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(80,$key),$rounds); + &xor ("eax","eax"); + &ret(); + +&set_label("key_128",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); +&set_label("key_128_cold"); + &shufps ("xmm4","xmm0",0b00010000); + &xorps ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100); + &xorps ("xmm0","xmm4"); + &shufps ("xmm1","xmm1",0b11111111); # critical path + &xorps ("xmm0","xmm1"); + &ret(); + +&set_label("12rounds",16); + &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey + &mov ($rounds,11); + &$movekey (&QWP(-16,$key),"xmm0") # round 0 + &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2 + &call (&label("key_192a_cold")); + &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3 + &call (&label("key_192b")); + &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5 + &call (&label("key_192a")); + &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6 + &call (&label("key_192b")); + &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8 + &call (&label("key_192a")); + &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9 + &call (&label("key_192b")); + &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11 + &call (&label("key_192a")); + &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12 + &call (&label("key_192b")); + &$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(48,$key),$rounds); + &xor ("eax","eax"); + &ret(); + +&set_label("key_192a",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); +&set_label("key_192a_cold",16); + &movaps ("xmm5","xmm2"); +&set_label("key_192b_warm"); + &shufps ("xmm4","xmm0",0b00010000); + &movdqa ("xmm3","xmm2"); + &xorps ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100); + &pslldq ("xmm3",4); + &xorps ("xmm0","xmm4"); + &pshufd ("xmm1","xmm1",0b01010101); # critical path + &pxor ("xmm2","xmm3"); + &pxor ("xmm0","xmm1"); + &pshufd ("xmm3","xmm0",0b11111111); + &pxor ("xmm2","xmm3"); + &ret(); + +&set_label("key_192b",16); + &movaps ("xmm3","xmm0"); + &shufps ("xmm5","xmm0",0b01000100); + &$movekey (&QWP(0,$key),"xmm5"); + &shufps ("xmm3","xmm2",0b01001110); + &$movekey (&QWP(16,$key),"xmm3"); + &lea ($key,&DWP(32,$key)); + &jmp (&label("key_192b_warm")); + +&set_label("14rounds",16); + &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey + &mov ($rounds,13); + &lea ($key,&DWP(16,$key)); + &$movekey (&QWP(-32,$key),"xmm0"); # round 0 + &$movekey (&QWP(-16,$key),"xmm2"); # round 1 + &aeskeygenassist("xmm1","xmm2",0x01); # round 2 + &call (&label("key_256a_cold")); + &aeskeygenassist("xmm1","xmm0",0x01); # round 3 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x02); # round 4 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x02); # round 5 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x04); # round 6 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x04); # round 7 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x08); # round 8 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x08); # round 9 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x10); # round 10 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x10); # round 11 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x20); # round 12 + &call (&label("key_256a")); + &aeskeygenassist("xmm1","xmm0",0x20); # round 13 + &call (&label("key_256b")); + &aeskeygenassist("xmm1","xmm2",0x40); # round 14 + &call (&label("key_256a")); + &$movekey (&QWP(0,$key),"xmm0"); + &mov (&DWP(16,$key),$rounds); + &xor ("eax","eax"); + &ret(); + +&set_label("key_256a",16); + &$movekey (&QWP(0,$key),"xmm2"); + &lea ($key,&DWP(16,$key)); +&set_label("key_256a_cold"); + &shufps ("xmm4","xmm0",0b00010000); + &xorps ("xmm0","xmm4"); + &shufps ("xmm4","xmm0",0b10001100); + &xorps ("xmm0","xmm4"); + &shufps ("xmm1","xmm1",0b11111111); # critical path + &xorps ("xmm0","xmm1"); + &ret(); + +&set_label("key_256b",16); + &$movekey (&QWP(0,$key),"xmm0"); + &lea ($key,&DWP(16,$key)); + + &shufps ("xmm4","xmm2",0b00010000); + &xorps ("xmm2","xmm4"); + &shufps ("xmm4","xmm2",0b10001100); + &xorps ("xmm2","xmm4"); + &shufps ("xmm1","xmm1",0b10101010); # critical path + &xorps ("xmm2","xmm1"); + &ret(); + +&set_label("bad_pointer",4); + &mov ("eax",-1); + &ret (); +&set_label("bad_keybits",4); + &mov ("eax",-2); + &ret (); +&function_end_B("_aesni_set_encrypt_key"); + +# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, +# AES_KEY *key) +&function_begin_B("${PREFIX}_set_encrypt_key"); + &mov ("eax",&wparam(0)); + &mov ($rounds,&wparam(1)); + &mov ($key,&wparam(2)); + &call ("_aesni_set_encrypt_key"); + &ret (); +&function_end_B("${PREFIX}_set_encrypt_key"); + +# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, +# AES_KEY *key) +&function_begin_B("${PREFIX}_set_decrypt_key"); + &mov ("eax",&wparam(0)); + &mov ($rounds,&wparam(1)); + &mov ($key,&wparam(2)); + &call ("_aesni_set_encrypt_key"); + &mov ($key,&wparam(2)); + &shl ($rounds,4) # rounds-1 after _aesni_set_encrypt_key + &test ("eax","eax"); + &jnz (&label("dec_key_ret")); + &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule + + &$movekey ("xmm0",&QWP(0,$key)); # just swap + &$movekey ("xmm1",&QWP(0,"eax")); + &$movekey (&QWP(0,"eax"),"xmm0"); + &$movekey (&QWP(0,$key),"xmm1"); + &lea ($key,&DWP(16,$key)); + &lea ("eax",&DWP(-16,"eax")); + +&set_label("dec_key_inverse"); + &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse + &$movekey ("xmm1",&QWP(0,"eax")); + &aesimc ("xmm0","xmm0"); + &aesimc ("xmm1","xmm1"); + &lea ($key,&DWP(16,$key)); + &lea ("eax",&DWP(-16,"eax")); + &$movekey (&QWP(16,"eax"),"xmm0"); + &$movekey (&QWP(-16,$key),"xmm1"); + &cmp ("eax",$key); + &ja (&label("dec_key_inverse")); + + &$movekey ("xmm0",&QWP(0,$key)); # inverse middle + &aesimc ("xmm0","xmm0"); + &$movekey (&QWP(0,$key),"xmm0"); + + &xor ("eax","eax"); # return success +&set_label("dec_key_ret"); + &ret (); +&function_end_B("${PREFIX}_set_decrypt_key"); +&asciz("AES for Intel AES-NI, CRYPTOGAMS by "); + +&asm_finish(); diff --git a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl index 49e0f4b351..499f3b3f42 100644 --- a/src/lib/libcrypto/aes/asm/aesni-x86_64.pl +++ b/src/lib/libcrypto/aes/asm/aesni-x86_64.pl @@ -11,6 +11,151 @@ # OpenSSL context it's used with Intel engine, but can also be used as # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for # details]. +# +# Performance. +# +# Given aes(enc|dec) instructions' latency asymptotic performance for +# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte +# processed with 128-bit key. And given their throughput asymptotic +# performance for parallelizable modes is 1.25 cycles per byte. Being +# asymptotic limit it's not something you commonly achieve in reality, +# but how close does one get? Below are results collected for +# different modes and block sized. Pairs of numbers are for en-/ +# decryption. +# +# 16-byte 64-byte 256-byte 1-KB 8-KB +# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26 +# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26 +# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28 +# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07 +# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38 +# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55 +# +# ECB, CTR, CBC and CCM results are free from EVP overhead. This means +# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni +# [-decrypt]' will exhibit 10-15% worse results for smaller blocks. +# The results were collected with specially crafted speed.c benchmark +# in order to compare them with results reported in "Intel Advanced +# Encryption Standard (AES) New Instruction Set" White Paper Revision +# 3.0 dated May 2010. All above results are consistently better. This +# module also provides better performance for block sizes smaller than +# 128 bytes in points *not* represented in the above table. +# +# Looking at the results for 8-KB buffer. +# +# CFB and OFB results are far from the limit, because implementation +# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on +# single-block aesni_encrypt, which is not the most optimal way to go. +# CBC encrypt result is unexpectedly high and there is no documented +# explanation for it. Seemingly there is a small penalty for feeding +# the result back to AES unit the way it's done in CBC mode. There is +# nothing one can do and the result appears optimal. CCM result is +# identical to CBC, because CBC-MAC is essentially CBC encrypt without +# saving output. CCM CTR "stays invisible," because it's neatly +# interleaved wih CBC-MAC. This provides ~30% improvement over +# "straghtforward" CCM implementation with CTR and CBC-MAC performed +# disjointly. Parallelizable modes practically achieve the theoretical +# limit. +# +# Looking at how results vary with buffer size. +# +# Curves are practically saturated at 1-KB buffer size. In most cases +# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one. +# CTR curve doesn't follow this pattern and is "slowest" changing one +# with "256-byte" result being 87% of "8-KB." This is because overhead +# in CTR mode is most computationally intensive. Small-block CCM +# decrypt is slower than encrypt, because first CTR and last CBC-MAC +# iterations can't be interleaved. +# +# Results for 192- and 256-bit keys. +# +# EVP-free results were observed to scale perfectly with number of +# rounds for larger block sizes, i.e. 192-bit result being 10/12 times +# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences +# are a tad smaller, because the above mentioned penalty biases all +# results by same constant value. In similar way function call +# overhead affects small-block performance, as well as OFB and CFB +# results. Differences are not large, most common coefficients are +# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one +# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)... + +# January 2011 +# +# While Westmere processor features 6 cycles latency for aes[enc|dec] +# instructions, which can be scheduled every second cycle, Sandy +# Bridge spends 8 cycles per instruction, but it can schedule them +# every cycle. This means that code targeting Westmere would perform +# suboptimally on Sandy Bridge. Therefore this update. +# +# In addition, non-parallelizable CBC encrypt (as well as CCM) is +# optimized. Relative improvement might appear modest, 8% on Westmere, +# but in absolute terms it's 3.77 cycles per byte encrypted with +# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers +# should be compared to asymptotic limits of 3.75 for Westmere and +# 5.00 for Sandy Bridge. Actually, the fact that they get this close +# to asymptotic limits is quite amazing. Indeed, the limit is +# calculated as latency times number of rounds, 10 for 128-bit key, +# and divided by 16, the number of bytes in block, or in other words +# it accounts *solely* for aesenc instructions. But there are extra +# instructions, and numbers so close to the asymptotic limits mean +# that it's as if it takes as little as *one* additional cycle to +# execute all of them. How is it possible? It is possible thanks to +# out-of-order execution logic, which manages to overlap post- +# processing of previous block, things like saving the output, with +# actual encryption of current block, as well as pre-processing of +# current block, things like fetching input and xor-ing it with +# 0-round element of the key schedule, with actual encryption of +# previous block. Keep this in mind... +# +# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher +# performance is achieved by interleaving instructions working on +# independent blocks. In which case asymptotic limit for such modes +# can be obtained by dividing above mentioned numbers by AES +# instructions' interleave factor. Westmere can execute at most 3 +# instructions at a time, meaning that optimal interleave factor is 3, +# and that's where the "magic" number of 1.25 come from. "Optimal +# interleave factor" means that increase of interleave factor does +# not improve performance. The formula has proven to reflect reality +# pretty well on Westmere... Sandy Bridge on the other hand can +# execute up to 8 AES instructions at a time, so how does varying +# interleave factor affect the performance? Here is table for ECB +# (numbers are cycles per byte processed with 128-bit key): +# +# instruction interleave factor 3x 6x 8x +# theoretical asymptotic limit 1.67 0.83 0.625 +# measured performance for 8KB block 1.05 0.86 0.84 +# +# "as if" interleave factor 4.7x 5.8x 6.0x +# +# Further data for other parallelizable modes: +# +# CBC decrypt 1.16 0.93 0.93 +# CTR 1.14 0.91 n/a +# +# Well, given 3x column it's probably inappropriate to call the limit +# asymptotic, if it can be surpassed, isn't it? What happens there? +# Rewind to CBC paragraph for the answer. Yes, out-of-order execution +# magic is responsible for this. Processor overlaps not only the +# additional instructions with AES ones, but even AES instuctions +# processing adjacent triplets of independent blocks. In the 6x case +# additional instructions still claim disproportionally small amount +# of additional cycles, but in 8x case number of instructions must be +# a tad too high for out-of-order logic to cope with, and AES unit +# remains underutilized... As you can see 8x interleave is hardly +# justifiable, so there no need to feel bad that 32-bit aesni-x86.pl +# utilizies 6x interleave because of limited register bank capacity. +# +# Higher interleave factors do have negative impact on Westmere +# performance. While for ECB mode it's negligible ~1.5%, other +# parallelizables perform ~5% worse, which is outweighed by ~25% +# improvement on Sandy Bridge. To balance regression on Westmere +# CTR mode was implemented with 6x aesenc interleave factor. + +# April 2011 +# +# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing +# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like +# in CTR mode AES instruction interleave factor was chosen to be 6x. $PREFIX="aesni"; # if $PREFIX is set to "AES", the script # generates drop-in replacement for @@ -29,7 +174,7 @@ die "can't locate x86_64-xlate.pl"; open STDOUT,"| $^X $xlate $flavour $output"; -$movkey = $PREFIX eq "aesni" ? "movaps" : "movups"; +$movkey = $PREFIX eq "aesni" ? "movups" : "movups"; @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order ("%rdi","%rsi","%rdx","%rcx"); # Unix order @@ -41,18 +186,20 @@ $inp="%rdi"; $out="%rsi"; $len="%rdx"; $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!! -$ivp="%r8"; # cbc +$ivp="%r8"; # cbc, ctr, ... $rnds_="%r10d"; # backup copy for $rounds $key_="%r11"; # backup copy for $key # %xmm register layout -$inout0="%xmm0"; $inout1="%xmm1"; -$inout2="%xmm2"; $inout3="%xmm3"; -$rndkey0="%xmm4"; $rndkey1="%xmm5"; - -$iv="%xmm6"; $in0="%xmm7"; # used in CBC decrypt -$in1="%xmm8"; $in2="%xmm9"; +$rndkey0="%xmm0"; $rndkey1="%xmm1"; +$inout0="%xmm2"; $inout1="%xmm3"; +$inout2="%xmm4"; $inout3="%xmm5"; +$inout4="%xmm6"; $inout5="%xmm7"; +$inout6="%xmm8"; $inout7="%xmm9"; + +$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ... +$in0="%xmm8"; $iv="%xmm9"; # Inline version of internal aesni_[en|de]crypt1. # @@ -60,20 +207,29 @@ $in1="%xmm8"; $in2="%xmm9"; # cycles which take care of loop variables... { my $sn; sub aesni_generate1 { -my ($p,$key,$rounds)=@_; +my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout)); ++$sn; $code.=<<___; $movkey ($key),$rndkey0 $movkey 16($key),$rndkey1 +___ +$code.=<<___ if (defined($ivec)); + xorps $rndkey0,$ivec + lea 32($key),$key + xorps $ivec,$inout +___ +$code.=<<___ if (!defined($ivec)); lea 32($key),$key - pxor $rndkey0,$inout0 + xorps $rndkey0,$inout +___ +$code.=<<___; .Loop_${p}1_$sn: - aes${p} $rndkey1,$inout0 + aes${p} $rndkey1,$inout dec $rounds $movkey ($key),$rndkey1 lea 16($key),$key jnz .Loop_${p}1_$sn # loop body is 16 bytes - aes${p}last $rndkey1,$inout0 + aes${p}last $rndkey1,$inout ___ }} # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key); @@ -86,7 +242,7 @@ $code.=<<___; .align 16 ${PREFIX}_encrypt: movups ($inp),$inout0 # load input - mov 240($key),$rounds # pull $rounds + mov 240($key),$rounds # key->rounds ___ &aesni_generate1("enc",$key,$rounds); $code.=<<___; @@ -99,7 +255,7 @@ $code.=<<___; .align 16 ${PREFIX}_decrypt: movups ($inp),$inout0 # load input - mov 240($key),$rounds # pull $rounds + mov 240($key),$rounds # key->rounds ___ &aesni_generate1("dec",$key,$rounds); $code.=<<___; @@ -109,16 +265,16 @@ $code.=<<___; ___ } -# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave -# factor. Why 3x subroutine is used in loops? Even though aes[enc|dec] -# latency is 6, it turned out that it can be scheduled only every -# *second* cycle. Thus 3x interleave is the one providing optimal +# _aesni_[en|de]cryptN are private interfaces, N denotes interleave +# factor. Why 3x subroutine were originally used in loops? Even though +# aes[enc|dec] latency was originally 6, it could be scheduled only +# every *2nd* cycle. Thus 3x interleave was the one providing optimal # utilization, i.e. when subroutine's throughput is virtually same as # of non-interleaved subroutine [for number of input blocks up to 3]. -# This is why it makes no sense to implement 2x subroutine. As soon -# as/if Intel improves throughput by making it possible to schedule -# the instructions in question *every* cycles I would have to -# implement 6x interleave and use it in loop... +# This is why it makes no sense to implement 2x subroutine. +# aes[enc|dec] latency in next processor generation is 8, but the +# instructions can be scheduled every cycle. Optimal interleave for +# new processor is therefore 8x... sub aesni_generate3 { my $dir=shift; # As already mentioned it takes in $key and $rounds, which are *not* @@ -131,25 +287,25 @@ _aesni_${dir}rypt3: shr \$1,$rounds $movkey 16($key),$rndkey1 lea 32($key),$key - pxor $rndkey0,$inout0 - pxor $rndkey0,$inout1 - pxor $rndkey0,$inout2 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + xorps $rndkey0,$inout2 + $movkey ($key),$rndkey0 .L${dir}_loop3: aes${dir} $rndkey1,$inout0 - $movkey ($key),$rndkey0 aes${dir} $rndkey1,$inout1 dec $rounds aes${dir} $rndkey1,$inout2 - aes${dir} $rndkey0,$inout0 $movkey 16($key),$rndkey1 + aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 lea 32($key),$key aes${dir} $rndkey0,$inout2 + $movkey ($key),$rndkey0 jnz .L${dir}_loop3 aes${dir} $rndkey1,$inout0 - $movkey ($key),$rndkey0 aes${dir} $rndkey1,$inout1 aes${dir} $rndkey1,$inout2 aes${dir}last $rndkey0,$inout0 @@ -175,28 +331,28 @@ _aesni_${dir}rypt4: shr \$1,$rounds $movkey 16($key),$rndkey1 lea 32($key),$key - pxor $rndkey0,$inout0 - pxor $rndkey0,$inout1 - pxor $rndkey0,$inout2 - pxor $rndkey0,$inout3 + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + xorps $rndkey0,$inout2 + xorps $rndkey0,$inout3 + $movkey ($key),$rndkey0 .L${dir}_loop4: aes${dir} $rndkey1,$inout0 - $movkey ($key),$rndkey0 aes${dir} $rndkey1,$inout1 dec $rounds aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 - aes${dir} $rndkey0,$inout0 $movkey 16($key),$rndkey1 + aes${dir} $rndkey0,$inout0 aes${dir} $rndkey0,$inout1 lea 32($key),$key aes${dir} $rndkey0,$inout2 aes${dir} $rndkey0,$inout3 + $movkey ($key),$rndkey0 jnz .L${dir}_loop4 aes${dir} $rndkey1,$inout0 - $movkey ($key),$rndkey0 aes${dir} $rndkey1,$inout1 aes${dir} $rndkey1,$inout2 aes${dir} $rndkey1,$inout3 @@ -208,12 +364,158 @@ _aesni_${dir}rypt4: .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4 ___ } +sub aesni_generate6 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-5] is cipher/clear text... +$code.=<<___; +.type _aesni_${dir}rypt6,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt6: + $movkey ($key),$rndkey0 + shr \$1,$rounds + $movkey 16($key),$rndkey1 + lea 32($key),$key + xorps $rndkey0,$inout0 + pxor $rndkey0,$inout1 + aes${dir} $rndkey1,$inout0 + pxor $rndkey0,$inout2 + aes${dir} $rndkey1,$inout1 + pxor $rndkey0,$inout3 + aes${dir} $rndkey1,$inout2 + pxor $rndkey0,$inout4 + aes${dir} $rndkey1,$inout3 + pxor $rndkey0,$inout5 + dec $rounds + aes${dir} $rndkey1,$inout4 + $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout5 + jmp .L${dir}_loop6_enter +.align 16 +.L${dir}_loop6: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + dec $rounds + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 +.L${dir}_loop6_enter: # happens to be 16-byte aligned + $movkey 16($key),$rndkey1 + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + lea 32($key),$key + aes${dir} $rndkey0,$inout2 + aes${dir} $rndkey0,$inout3 + aes${dir} $rndkey0,$inout4 + aes${dir} $rndkey0,$inout5 + $movkey ($key),$rndkey0 + jnz .L${dir}_loop6 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + aes${dir}last $rndkey0,$inout3 + aes${dir}last $rndkey0,$inout4 + aes${dir}last $rndkey0,$inout5 + ret +.size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6 +___ +} +sub aesni_generate8 { +my $dir=shift; +# As already mentioned it takes in $key and $rounds, which are *not* +# preserved. $inout[0-7] is cipher/clear text... +$code.=<<___; +.type _aesni_${dir}rypt8,\@abi-omnipotent +.align 16 +_aesni_${dir}rypt8: + $movkey ($key),$rndkey0 + shr \$1,$rounds + $movkey 16($key),$rndkey1 + lea 32($key),$key + xorps $rndkey0,$inout0 + xorps $rndkey0,$inout1 + aes${dir} $rndkey1,$inout0 + pxor $rndkey0,$inout2 + aes${dir} $rndkey1,$inout1 + pxor $rndkey0,$inout3 + aes${dir} $rndkey1,$inout2 + pxor $rndkey0,$inout4 + aes${dir} $rndkey1,$inout3 + pxor $rndkey0,$inout5 + dec $rounds + aes${dir} $rndkey1,$inout4 + pxor $rndkey0,$inout6 + aes${dir} $rndkey1,$inout5 + pxor $rndkey0,$inout7 + $movkey ($key),$rndkey0 + aes${dir} $rndkey1,$inout6 + aes${dir} $rndkey1,$inout7 + $movkey 16($key),$rndkey1 + jmp .L${dir}_loop8_enter +.align 16 +.L${dir}_loop8: + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + dec $rounds + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir} $rndkey1,$inout6 + aes${dir} $rndkey1,$inout7 + $movkey 16($key),$rndkey1 +.L${dir}_loop8_enter: # happens to be 16-byte aligned + aes${dir} $rndkey0,$inout0 + aes${dir} $rndkey0,$inout1 + lea 32($key),$key + aes${dir} $rndkey0,$inout2 + aes${dir} $rndkey0,$inout3 + aes${dir} $rndkey0,$inout4 + aes${dir} $rndkey0,$inout5 + aes${dir} $rndkey0,$inout6 + aes${dir} $rndkey0,$inout7 + $movkey ($key),$rndkey0 + jnz .L${dir}_loop8 + + aes${dir} $rndkey1,$inout0 + aes${dir} $rndkey1,$inout1 + aes${dir} $rndkey1,$inout2 + aes${dir} $rndkey1,$inout3 + aes${dir} $rndkey1,$inout4 + aes${dir} $rndkey1,$inout5 + aes${dir} $rndkey1,$inout6 + aes${dir} $rndkey1,$inout7 + aes${dir}last $rndkey0,$inout0 + aes${dir}last $rndkey0,$inout1 + aes${dir}last $rndkey0,$inout2 + aes${dir}last $rndkey0,$inout3 + aes${dir}last $rndkey0,$inout4 + aes${dir}last $rndkey0,$inout5 + aes${dir}last $rndkey0,$inout6 + aes${dir}last $rndkey0,$inout7 + ret +.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8 +___ +} &aesni_generate3("enc") if ($PREFIX eq "aesni"); &aesni_generate3("dec"); &aesni_generate4("enc") if ($PREFIX eq "aesni"); &aesni_generate4("dec"); +&aesni_generate6("enc") if ($PREFIX eq "aesni"); +&aesni_generate6("dec"); +&aesni_generate8("enc") if ($PREFIX eq "aesni"); +&aesni_generate8("dec"); if ($PREFIX eq "aesni") { +######################################################################## # void aesni_ecb_encrypt (const void *in, void *out, # size_t length, const AES_KEY *key, # int enc); @@ -222,54 +524,98 @@ $code.=<<___; .type aesni_ecb_encrypt,\@function,5 .align 16 aesni_ecb_encrypt: - cmp \$16,$len # check length - jb .Lecb_ret - - mov 240($key),$rounds # pull $rounds and \$-16,$len + jz .Lecb_ret + + mov 240($key),$rounds # key->rounds + $movkey ($key),$rndkey0 mov $key,$key_ # backup $key - test %r8d,%r8d # 5th argument mov $rounds,$rnds_ # backup $rounds + test %r8d,%r8d # 5th argument jz .Lecb_decrypt #--------------------------- ECB ENCRYPT ------------------------------# - sub \$0x40,$len - jbe .Lecb_enc_tail - jmp .Lecb_enc_loop3 + cmp \$0x80,$len + jb .Lecb_enc_tail + + movdqu ($inp),$inout0 + movdqu 0x10($inp),$inout1 + movdqu 0x20($inp),$inout2 + movdqu 0x30($inp),$inout3 + movdqu 0x40($inp),$inout4 + movdqu 0x50($inp),$inout5 + movdqu 0x60($inp),$inout6 + movdqu 0x70($inp),$inout7 + lea 0x80($inp),$inp + sub \$0x80,$len + jmp .Lecb_enc_loop8_enter .align 16 -.Lecb_enc_loop3: - movups ($inp),$inout0 - movups 0x10($inp),$inout1 - movups 0x20($inp),$inout2 - call _aesni_encrypt3 - sub \$0x30,$len - lea 0x30($inp),$inp - lea 0x30($out),$out - movups $inout0,-0x30($out) - mov $rnds_,$rounds # restore $rounds - movups $inout1,-0x20($out) +.Lecb_enc_loop8: + movups $inout0,($out) mov $key_,$key # restore $key - movups $inout2,-0x10($out) - ja .Lecb_enc_loop3 + movdqu ($inp),$inout0 + mov $rnds_,$rounds # restore $rounds + movups $inout1,0x10($out) + movdqu 0x10($inp),$inout1 + movups $inout2,0x20($out) + movdqu 0x20($inp),$inout2 + movups $inout3,0x30($out) + movdqu 0x30($inp),$inout3 + movups $inout4,0x40($out) + movdqu 0x40($inp),$inout4 + movups $inout5,0x50($out) + movdqu 0x50($inp),$inout5 + movups $inout6,0x60($out) + movdqu 0x60($inp),$inout6 + movups $inout7,0x70($out) + lea 0x80($out),$out + movdqu 0x70($inp),$inout7 + lea 0x80($inp),$inp +.Lecb_enc_loop8_enter: + + call _aesni_encrypt8 + + sub \$0x80,$len + jnc .Lecb_enc_loop8 -.Lecb_enc_tail: - add \$0x40,$len + movups $inout0,($out) + mov $key_,$key # restore $key + movups $inout1,0x10($out) + mov $rnds_,$rounds # restore $rounds + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + movups $inout6,0x60($out) + movups $inout7,0x70($out) + lea 0x80($out),$out + add \$0x80,$len jz .Lecb_ret - cmp \$0x10,$len +.Lecb_enc_tail: movups ($inp),$inout0 - je .Lecb_enc_one cmp \$0x20,$len + jb .Lecb_enc_one movups 0x10($inp),$inout1 je .Lecb_enc_two - cmp \$0x30,$len movups 0x20($inp),$inout2 - je .Lecb_enc_three + cmp \$0x40,$len + jb .Lecb_enc_three movups 0x30($inp),$inout3 - call _aesni_encrypt4 + je .Lecb_enc_four + movups 0x40($inp),$inout4 + cmp \$0x60,$len + jb .Lecb_enc_five + movups 0x50($inp),$inout5 + je .Lecb_enc_six + movdqu 0x60($inp),$inout6 + call _aesni_encrypt8 movups $inout0,($out) movups $inout1,0x10($out) movups $inout2,0x20($out) movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + movups $inout6,0x60($out) jmp .Lecb_ret .align 16 .Lecb_enc_one: @@ -280,6 +626,7 @@ $code.=<<___; jmp .Lecb_ret .align 16 .Lecb_enc_two: + xorps $inout2,$inout2 call _aesni_encrypt3 movups $inout0,($out) movups $inout1,0x10($out) @@ -291,42 +638,145 @@ $code.=<<___; movups $inout1,0x10($out) movups $inout2,0x20($out) jmp .Lecb_ret +.align 16 +.Lecb_enc_four: + call _aesni_encrypt4 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + jmp .Lecb_ret +.align 16 +.Lecb_enc_five: + xorps $inout5,$inout5 + call _aesni_encrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + jmp .Lecb_ret +.align 16 +.Lecb_enc_six: + call _aesni_encrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + jmp .Lecb_ret #--------------------------- ECB DECRYPT ------------------------------# .align 16 .Lecb_decrypt: - sub \$0x40,$len - jbe .Lecb_dec_tail - jmp .Lecb_dec_loop3 + cmp \$0x80,$len + jb .Lecb_dec_tail + + movdqu ($inp),$inout0 + movdqu 0x10($inp),$inout1 + movdqu 0x20($inp),$inout2 + movdqu 0x30($inp),$inout3 + movdqu 0x40($inp),$inout4 + movdqu 0x50($inp),$inout5 + movdqu 0x60($inp),$inout6 + movdqu 0x70($inp),$inout7 + lea 0x80($inp),$inp + sub \$0x80,$len + jmp .Lecb_dec_loop8_enter .align 16 -.Lecb_dec_loop3: - movups ($inp),$inout0 - movups 0x10($inp),$inout1 - movups 0x20($inp),$inout2 - call _aesni_decrypt3 - sub \$0x30,$len - lea 0x30($inp),$inp - lea 0x30($out),$out - movups $inout0,-0x30($out) - mov $rnds_,$rounds # restore $rounds - movups $inout1,-0x20($out) +.Lecb_dec_loop8: + movups $inout0,($out) mov $key_,$key # restore $key - movups $inout2,-0x10($out) - ja .Lecb_dec_loop3 + movdqu ($inp),$inout0 + mov $rnds_,$rounds # restore $rounds + movups $inout1,0x10($out) + movdqu 0x10($inp),$inout1 + movups $inout2,0x20($out) + movdqu 0x20($inp),$inout2 + movups $inout3,0x30($out) + movdqu 0x30($inp),$inout3 + movups $inout4,0x40($out) + movdqu 0x40($inp),$inout4 + movups $inout5,0x50($out) + movdqu 0x50($inp),$inout5 + movups $inout6,0x60($out) + movdqu 0x60($inp),$inout6 + movups $inout7,0x70($out) + lea 0x80($out),$out + movdqu 0x70($inp),$inout7 + lea 0x80($inp),$inp +.Lecb_dec_loop8_enter: + + call _aesni_decrypt8 + + $movkey ($key_),$rndkey0 + sub \$0x80,$len + jnc .Lecb_dec_loop8 -.Lecb_dec_tail: - add \$0x40,$len + movups $inout0,($out) + mov $key_,$key # restore $key + movups $inout1,0x10($out) + mov $rnds_,$rounds # restore $rounds + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + movups $inout6,0x60($out) + movups $inout7,0x70($out) + lea 0x80($out),$out + add \$0x80,$len jz .Lecb_ret - cmp \$0x10,$len +.Lecb_dec_tail: movups ($inp),$inout0 - je .Lecb_dec_one cmp \$0x20,$len + jb .Lecb_dec_one movups 0x10($inp),$inout1 je .Lecb_dec_two - cmp \$0x30,$len movups 0x20($inp),$inout2 - je .Lecb_dec_three + cmp \$0x40,$len + jb .Lecb_dec_three movups 0x30($inp),$inout3 + je .Lecb_dec_four + movups 0x40($inp),$inout4 + cmp \$0x60,$len + jb .Lecb_dec_five + movups 0x50($inp),$inout5 + je .Lecb_dec_six + movups 0x60($inp),$inout6 + $movkey ($key),$rndkey0 + call _aesni_decrypt8 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + movups $inout6,0x60($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_one: +___ + &aesni_generate1("dec",$key,$rounds); +$code.=<<___; + movups $inout0,($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_two: + xorps $inout2,$inout2 + call _aesni_decrypt3 + movups $inout0,($out) + movups $inout1,0x10($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_three: + call _aesni_decrypt3 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_four: call _aesni_decrypt4 movups $inout0,($out) movups $inout1,0x10($out) @@ -334,35 +784,1343 @@ $code.=<<___; movups $inout3,0x30($out) jmp .Lecb_ret .align 16 -.Lecb_dec_one: +.Lecb_dec_five: + xorps $inout5,$inout5 + call _aesni_decrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + jmp .Lecb_ret +.align 16 +.Lecb_dec_six: + call _aesni_decrypt6 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + +.Lecb_ret: + ret +.size aesni_ecb_encrypt,.-aesni_ecb_encrypt +___ + +{ +###################################################################### +# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out, +# size_t blocks, const AES_KEY *key, +# const char *ivec,char *cmac); +# +# Handles only complete blocks, operates on 64-bit counter and +# does not update *ivec! Nor does it finalize CMAC value +# (see engine/eng_aesni.c for details) +# +{ +my $cmac="%r9"; # 6th argument + +my $increment="%xmm6"; +my $bswap_mask="%xmm7"; + +$code.=<<___; +.globl aesni_ccm64_encrypt_blocks +.type aesni_ccm64_encrypt_blocks,\@function,6 +.align 16 +aesni_ccm64_encrypt_blocks: +___ +$code.=<<___ if ($win64); + lea -0x58(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) +.Lccm64_enc_body: +___ +$code.=<<___; + mov 240($key),$rounds # key->rounds + movdqu ($ivp),$iv + movdqa .Lincrement64(%rip),$increment + movdqa .Lbswap_mask(%rip),$bswap_mask + + shr \$1,$rounds + lea 0($key),$key_ + movdqu ($cmac),$inout1 + movdqa $iv,$inout0 + mov $rounds,$rnds_ + pshufb $bswap_mask,$iv + jmp .Lccm64_enc_outer +.align 16 +.Lccm64_enc_outer: + $movkey ($key_),$rndkey0 + mov $rnds_,$rounds + movups ($inp),$in0 # load inp + + xorps $rndkey0,$inout0 # counter + $movkey 16($key_),$rndkey1 + xorps $in0,$rndkey0 + lea 32($key_),$key + xorps $rndkey0,$inout1 # cmac^=inp + $movkey ($key),$rndkey0 + +.Lccm64_enc2_loop: + aesenc $rndkey1,$inout0 + dec $rounds + aesenc $rndkey1,$inout1 + $movkey 16($key),$rndkey1 + aesenc $rndkey0,$inout0 + lea 32($key),$key + aesenc $rndkey0,$inout1 + $movkey 0($key),$rndkey0 + jnz .Lccm64_enc2_loop + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + paddq $increment,$iv + aesenclast $rndkey0,$inout0 + aesenclast $rndkey0,$inout1 + + dec $len + lea 16($inp),$inp + xorps $inout0,$in0 # inp ^= E(iv) + movdqa $iv,$inout0 + movups $in0,($out) # save output + lea 16($out),$out + pshufb $bswap_mask,$inout0 + jnz .Lccm64_enc_outer + + movups $inout1,($cmac) +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + lea 0x58(%rsp),%rsp +.Lccm64_enc_ret: +___ +$code.=<<___; + ret +.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks +___ +###################################################################### +$code.=<<___; +.globl aesni_ccm64_decrypt_blocks +.type aesni_ccm64_decrypt_blocks,\@function,6 +.align 16 +aesni_ccm64_decrypt_blocks: +___ +$code.=<<___ if ($win64); + lea -0x58(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + movaps %xmm8,0x20(%rsp) + movaps %xmm9,0x30(%rsp) +.Lccm64_dec_body: +___ +$code.=<<___; + mov 240($key),$rounds # key->rounds + movups ($ivp),$iv + movdqu ($cmac),$inout1 + movdqa .Lincrement64(%rip),$increment + movdqa .Lbswap_mask(%rip),$bswap_mask + + movaps $iv,$inout0 + mov $rounds,$rnds_ + mov $key,$key_ + pshufb $bswap_mask,$iv +___ + &aesni_generate1("enc",$key,$rounds); +$code.=<<___; + movups ($inp),$in0 # load inp + paddq $increment,$iv + lea 16($inp),$inp + jmp .Lccm64_dec_outer +.align 16 +.Lccm64_dec_outer: + xorps $inout0,$in0 # inp ^= E(iv) + movdqa $iv,$inout0 + mov $rnds_,$rounds + movups $in0,($out) # save output + lea 16($out),$out + pshufb $bswap_mask,$inout0 + + sub \$1,$len + jz .Lccm64_dec_break + + $movkey ($key_),$rndkey0 + shr \$1,$rounds + $movkey 16($key_),$rndkey1 + xorps $rndkey0,$in0 + lea 32($key_),$key + xorps $rndkey0,$inout0 + xorps $in0,$inout1 # cmac^=out + $movkey ($key),$rndkey0 + +.Lccm64_dec2_loop: + aesenc $rndkey1,$inout0 + dec $rounds + aesenc $rndkey1,$inout1 + $movkey 16($key),$rndkey1 + aesenc $rndkey0,$inout0 + lea 32($key),$key + aesenc $rndkey0,$inout1 + $movkey 0($key),$rndkey0 + jnz .Lccm64_dec2_loop + movups ($inp),$in0 # load inp + paddq $increment,$iv + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + lea 16($inp),$inp + aesenclast $rndkey0,$inout0 + aesenclast $rndkey0,$inout1 + jmp .Lccm64_dec_outer + +.align 16 +.Lccm64_dec_break: + #xorps $in0,$inout1 # cmac^=out +___ + &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); +$code.=<<___; + movups $inout1,($cmac) +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + lea 0x58(%rsp),%rsp +.Lccm64_dec_ret: +___ +$code.=<<___; + ret +.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks +___ +} +###################################################################### +# void aesni_ctr32_encrypt_blocks (const void *in, void *out, +# size_t blocks, const AES_KEY *key, +# const char *ivec); +# +# Handles only complete blocks, operates on 32-bit counter and +# does not update *ivec! (see engine/eng_aesni.c for details) +# +{ +my $reserved = $win64?0:-0x28; +my ($in0,$in1,$in2,$in3)=map("%xmm$_",(8..11)); +my ($iv0,$iv1,$ivec)=("%xmm12","%xmm13","%xmm14"); +my $bswap_mask="%xmm15"; + +$code.=<<___; +.globl aesni_ctr32_encrypt_blocks +.type aesni_ctr32_encrypt_blocks,\@function,5 +.align 16 +aesni_ctr32_encrypt_blocks: +___ +$code.=<<___ if ($win64); + lea -0xc8(%rsp),%rsp + movaps %xmm6,0x20(%rsp) + movaps %xmm7,0x30(%rsp) + movaps %xmm8,0x40(%rsp) + movaps %xmm9,0x50(%rsp) + movaps %xmm10,0x60(%rsp) + movaps %xmm11,0x70(%rsp) + movaps %xmm12,0x80(%rsp) + movaps %xmm13,0x90(%rsp) + movaps %xmm14,0xa0(%rsp) + movaps %xmm15,0xb0(%rsp) +.Lctr32_body: +___ +$code.=<<___; + cmp \$1,$len + je .Lctr32_one_shortcut + + movdqu ($ivp),$ivec + movdqa .Lbswap_mask(%rip),$bswap_mask + xor $rounds,$rounds + pextrd \$3,$ivec,$rnds_ # pull 32-bit counter + pinsrd \$3,$rounds,$ivec # wipe 32-bit counter + + mov 240($key),$rounds # key->rounds + bswap $rnds_ + pxor $iv0,$iv0 # vector of 3 32-bit counters + pxor $iv1,$iv1 # vector of 3 32-bit counters + pinsrd \$0,$rnds_,$iv0 + lea 3($rnds_),$key_ + pinsrd \$0,$key_,$iv1 + inc $rnds_ + pinsrd \$1,$rnds_,$iv0 + inc $key_ + pinsrd \$1,$key_,$iv1 + inc $rnds_ + pinsrd \$2,$rnds_,$iv0 + inc $key_ + pinsrd \$2,$key_,$iv1 + movdqa $iv0,$reserved(%rsp) + pshufb $bswap_mask,$iv0 + movdqa $iv1,`$reserved+0x10`(%rsp) + pshufb $bswap_mask,$iv1 + + pshufd \$`3<<6`,$iv0,$inout0 # place counter to upper dword + pshufd \$`2<<6`,$iv0,$inout1 + pshufd \$`1<<6`,$iv0,$inout2 + cmp \$6,$len + jb .Lctr32_tail + shr \$1,$rounds + mov $key,$key_ # backup $key + mov $rounds,$rnds_ # backup $rounds + sub \$6,$len + jmp .Lctr32_loop6 + +.align 16 +.Lctr32_loop6: + pshufd \$`3<<6`,$iv1,$inout3 + por $ivec,$inout0 # merge counter-less ivec + $movkey ($key_),$rndkey0 + pshufd \$`2<<6`,$iv1,$inout4 + por $ivec,$inout1 + $movkey 16($key_),$rndkey1 + pshufd \$`1<<6`,$iv1,$inout5 + por $ivec,$inout2 + por $ivec,$inout3 + xorps $rndkey0,$inout0 + por $ivec,$inout4 + por $ivec,$inout5 + + # inline _aesni_encrypt6 and interleave last rounds + # with own code... + + pxor $rndkey0,$inout1 + aesenc $rndkey1,$inout0 + lea 32($key_),$key + pxor $rndkey0,$inout2 + aesenc $rndkey1,$inout1 + movdqa .Lincrement32(%rip),$iv1 + pxor $rndkey0,$inout3 + aesenc $rndkey1,$inout2 + movdqa $reserved(%rsp),$iv0 + pxor $rndkey0,$inout4 + aesenc $rndkey1,$inout3 + pxor $rndkey0,$inout5 + $movkey ($key),$rndkey0 + dec $rounds + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + jmp .Lctr32_enc_loop6_enter +.align 16 +.Lctr32_enc_loop6: + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + dec $rounds + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 +.Lctr32_enc_loop6_enter: + $movkey 16($key),$rndkey1 + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + lea 32($key),$key + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey ($key),$rndkey0 + jnz .Lctr32_enc_loop6 + + aesenc $rndkey1,$inout0 + paddd $iv1,$iv0 # increment counter vector + aesenc $rndkey1,$inout1 + paddd `$reserved+0x10`(%rsp),$iv1 + aesenc $rndkey1,$inout2 + movdqa $iv0,$reserved(%rsp) # save counter vector + aesenc $rndkey1,$inout3 + movdqa $iv1,`$reserved+0x10`(%rsp) + aesenc $rndkey1,$inout4 + pshufb $bswap_mask,$iv0 # byte swap + aesenc $rndkey1,$inout5 + pshufb $bswap_mask,$iv1 + + aesenclast $rndkey0,$inout0 + movups ($inp),$in0 # load input + aesenclast $rndkey0,$inout1 + movups 0x10($inp),$in1 + aesenclast $rndkey0,$inout2 + movups 0x20($inp),$in2 + aesenclast $rndkey0,$inout3 + movups 0x30($inp),$in3 + aesenclast $rndkey0,$inout4 + movups 0x40($inp),$rndkey1 + aesenclast $rndkey0,$inout5 + movups 0x50($inp),$rndkey0 + lea 0x60($inp),$inp + + xorps $inout0,$in0 # xor + pshufd \$`3<<6`,$iv0,$inout0 + xorps $inout1,$in1 + pshufd \$`2<<6`,$iv0,$inout1 + movups $in0,($out) # store output + xorps $inout2,$in2 + pshufd \$`1<<6`,$iv0,$inout2 + movups $in1,0x10($out) + xorps $inout3,$in3 + movups $in2,0x20($out) + xorps $inout4,$rndkey1 + movups $in3,0x30($out) + xorps $inout5,$rndkey0 + movups $rndkey1,0x40($out) + movups $rndkey0,0x50($out) + lea 0x60($out),$out + mov $rnds_,$rounds + sub \$6,$len + jnc .Lctr32_loop6 + + add \$6,$len + jz .Lctr32_done + mov $key_,$key # restore $key + lea 1($rounds,$rounds),$rounds # restore original value + +.Lctr32_tail: + por $ivec,$inout0 + movups ($inp),$in0 + cmp \$2,$len + jb .Lctr32_one + + por $ivec,$inout1 + movups 0x10($inp),$in1 + je .Lctr32_two + + pshufd \$`3<<6`,$iv1,$inout3 + por $ivec,$inout2 + movups 0x20($inp),$in2 + cmp \$4,$len + jb .Lctr32_three + + pshufd \$`2<<6`,$iv1,$inout4 + por $ivec,$inout3 + movups 0x30($inp),$in3 + je .Lctr32_four + + por $ivec,$inout4 + xorps $inout5,$inout5 + + call _aesni_encrypt6 + + movups 0x40($inp),$rndkey1 + xorps $inout0,$in0 + xorps $inout1,$in1 + movups $in0,($out) + xorps $inout2,$in2 + movups $in1,0x10($out) + xorps $inout3,$in3 + movups $in2,0x20($out) + xorps $inout4,$rndkey1 + movups $in3,0x30($out) + movups $rndkey1,0x40($out) + jmp .Lctr32_done + +.align 16 +.Lctr32_one_shortcut: + movups ($ivp),$inout0 + movups ($inp),$in0 + mov 240($key),$rounds # key->rounds +.Lctr32_one: +___ + &aesni_generate1("enc",$key,$rounds); +$code.=<<___; + xorps $inout0,$in0 + movups $in0,($out) + jmp .Lctr32_done + +.align 16 +.Lctr32_two: + xorps $inout2,$inout2 + call _aesni_encrypt3 + xorps $inout0,$in0 + xorps $inout1,$in1 + movups $in0,($out) + movups $in1,0x10($out) + jmp .Lctr32_done + +.align 16 +.Lctr32_three: + call _aesni_encrypt3 + xorps $inout0,$in0 + xorps $inout1,$in1 + movups $in0,($out) + xorps $inout2,$in2 + movups $in1,0x10($out) + movups $in2,0x20($out) + jmp .Lctr32_done + +.align 16 +.Lctr32_four: + call _aesni_encrypt4 + xorps $inout0,$in0 + xorps $inout1,$in1 + movups $in0,($out) + xorps $inout2,$in2 + movups $in1,0x10($out) + xorps $inout3,$in3 + movups $in2,0x20($out) + movups $in3,0x30($out) + +.Lctr32_done: +___ +$code.=<<___ if ($win64); + movaps 0x20(%rsp),%xmm6 + movaps 0x30(%rsp),%xmm7 + movaps 0x40(%rsp),%xmm8 + movaps 0x50(%rsp),%xmm9 + movaps 0x60(%rsp),%xmm10 + movaps 0x70(%rsp),%xmm11 + movaps 0x80(%rsp),%xmm12 + movaps 0x90(%rsp),%xmm13 + movaps 0xa0(%rsp),%xmm14 + movaps 0xb0(%rsp),%xmm15 + lea 0xc8(%rsp),%rsp +.Lctr32_ret: +___ +$code.=<<___; + ret +.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks +___ +} + +###################################################################### +# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2 +# const unsigned char iv[16]); +# +{ +my @tweak=map("%xmm$_",(10..15)); +my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]); +my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); +my $frame_size = 0x68 + ($win64?160:0); + +$code.=<<___; +.globl aesni_xts_encrypt +.type aesni_xts_encrypt,\@function,6 +.align 16 +aesni_xts_encrypt: + lea -$frame_size(%rsp),%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,0x60(%rsp) + movaps %xmm7,0x70(%rsp) + movaps %xmm8,0x80(%rsp) + movaps %xmm9,0x90(%rsp) + movaps %xmm10,0xa0(%rsp) + movaps %xmm11,0xb0(%rsp) + movaps %xmm12,0xc0(%rsp) + movaps %xmm13,0xd0(%rsp) + movaps %xmm14,0xe0(%rsp) + movaps %xmm15,0xf0(%rsp) +.Lxts_enc_body: +___ +$code.=<<___; + movups ($ivp),@tweak[5] # load clear-text tweak + mov 240(%r8),$rounds # key2->rounds + mov 240($key),$rnds_ # key1->rounds +___ + # generate the tweak + &aesni_generate1("enc",$key2,$rounds,@tweak[5]); +$code.=<<___; + mov $key,$key_ # backup $key + mov $rnds_,$rounds # backup $rounds + mov $len,$len_ # backup $len + and \$-16,$len + + movdqa .Lxts_magic(%rip),$twmask + pxor $twtmp,$twtmp + pcmpgtd @tweak[5],$twtmp # broadcast upper bits +___ + for ($i=0;$i<4;$i++) { + $code.=<<___; + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[$i] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + pand $twmask,$twres # isolate carry and residue + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + pxor $twres,@tweak[5] +___ + } +$code.=<<___; + sub \$16*6,$len + jc .Lxts_enc_short + + shr \$1,$rounds + sub \$1,$rounds + mov $rounds,$rnds_ + jmp .Lxts_enc_grandloop + +.align 16 +.Lxts_enc_grandloop: + pshufd \$0x13,$twtmp,$twres + movdqa @tweak[5],@tweak[4] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqu `16*0`($inp),$inout0 # load input + pand $twmask,$twres # isolate carry and residue + movdqu `16*1`($inp),$inout1 + pxor $twres,@tweak[5] + + movdqu `16*2`($inp),$inout2 + pxor @tweak[0],$inout0 # input^=tweak + movdqu `16*3`($inp),$inout3 + pxor @tweak[1],$inout1 + movdqu `16*4`($inp),$inout4 + pxor @tweak[2],$inout2 + movdqu `16*5`($inp),$inout5 + lea `16*6`($inp),$inp + pxor @tweak[3],$inout3 + $movkey ($key_),$rndkey0 + pxor @tweak[4],$inout4 + pxor @tweak[5],$inout5 + + # inline _aesni_encrypt6 and interleave first and last rounds + # with own code... + $movkey 16($key_),$rndkey1 + pxor $rndkey0,$inout0 + pxor $rndkey0,$inout1 + movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks + aesenc $rndkey1,$inout0 + lea 32($key_),$key + pxor $rndkey0,$inout2 + movdqa @tweak[1],`16*1`(%rsp) + aesenc $rndkey1,$inout1 + pxor $rndkey0,$inout3 + movdqa @tweak[2],`16*2`(%rsp) + aesenc $rndkey1,$inout2 + pxor $rndkey0,$inout4 + movdqa @tweak[3],`16*3`(%rsp) + aesenc $rndkey1,$inout3 + pxor $rndkey0,$inout5 + $movkey ($key),$rndkey0 + dec $rounds + movdqa @tweak[4],`16*4`(%rsp) + aesenc $rndkey1,$inout4 + movdqa @tweak[5],`16*5`(%rsp) + aesenc $rndkey1,$inout5 + pxor $twtmp,$twtmp + pcmpgtd @tweak[5],$twtmp + jmp .Lxts_enc_loop6_enter + +.align 16 +.Lxts_enc_loop6: + aesenc $rndkey1,$inout0 + aesenc $rndkey1,$inout1 + dec $rounds + aesenc $rndkey1,$inout2 + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 +.Lxts_enc_loop6_enter: + $movkey 16($key),$rndkey1 + aesenc $rndkey0,$inout0 + aesenc $rndkey0,$inout1 + lea 32($key),$key + aesenc $rndkey0,$inout2 + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey ($key),$rndkey0 + jnz .Lxts_enc_loop6 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesenc $rndkey1,$inout0 + pand $twmask,$twres # isolate carry and residue + aesenc $rndkey1,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcast upper bits + aesenc $rndkey1,$inout2 + pxor $twres,@tweak[5] + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + $movkey 16($key),$rndkey1 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[0] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesenc $rndkey0,$inout0 + pand $twmask,$twres # isolate carry and residue + aesenc $rndkey0,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesenc $rndkey0,$inout2 + pxor $twres,@tweak[5] + aesenc $rndkey0,$inout3 + aesenc $rndkey0,$inout4 + aesenc $rndkey0,$inout5 + $movkey 32($key),$rndkey0 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[1] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesenc $rndkey1,$inout0 + pand $twmask,$twres # isolate carry and residue + aesenc $rndkey1,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesenc $rndkey1,$inout2 + pxor $twres,@tweak[5] + aesenc $rndkey1,$inout3 + aesenc $rndkey1,$inout4 + aesenc $rndkey1,$inout5 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[2] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesenclast $rndkey0,$inout0 + pand $twmask,$twres # isolate carry and residue + aesenclast $rndkey0,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesenclast $rndkey0,$inout2 + pxor $twres,@tweak[5] + aesenclast $rndkey0,$inout3 + aesenclast $rndkey0,$inout4 + aesenclast $rndkey0,$inout5 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[3] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + xorps `16*0`(%rsp),$inout0 # output^=tweak + pand $twmask,$twres # isolate carry and residue + xorps `16*1`(%rsp),$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + pxor $twres,@tweak[5] + + xorps `16*2`(%rsp),$inout2 + movups $inout0,`16*0`($out) # write output + xorps `16*3`(%rsp),$inout3 + movups $inout1,`16*1`($out) + xorps `16*4`(%rsp),$inout4 + movups $inout2,`16*2`($out) + xorps `16*5`(%rsp),$inout5 + movups $inout3,`16*3`($out) + mov $rnds_,$rounds # restore $rounds + movups $inout4,`16*4`($out) + movups $inout5,`16*5`($out) + lea `16*6`($out),$out + sub \$16*6,$len + jnc .Lxts_enc_grandloop + + lea 3($rounds,$rounds),$rounds # restore original value + mov $key_,$key # restore $key + mov $rounds,$rnds_ # backup $rounds + +.Lxts_enc_short: + add \$16*6,$len + jz .Lxts_enc_done + + cmp \$0x20,$len + jb .Lxts_enc_one + je .Lxts_enc_two + + cmp \$0x40,$len + jb .Lxts_enc_three + je .Lxts_enc_four + + pshufd \$0x13,$twtmp,$twres + movdqa @tweak[5],@tweak[4] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqu ($inp),$inout0 + pand $twmask,$twres # isolate carry and residue + movdqu 16*1($inp),$inout1 + pxor $twres,@tweak[5] + + movdqu 16*2($inp),$inout2 + pxor @tweak[0],$inout0 + movdqu 16*3($inp),$inout3 + pxor @tweak[1],$inout1 + movdqu 16*4($inp),$inout4 + lea 16*5($inp),$inp + pxor @tweak[2],$inout2 + pxor @tweak[3],$inout3 + pxor @tweak[4],$inout4 + + call _aesni_encrypt6 + + xorps @tweak[0],$inout0 + movdqa @tweak[5],@tweak[0] + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + movdqu $inout0,($out) + xorps @tweak[3],$inout3 + movdqu $inout1,16*1($out) + xorps @tweak[4],$inout4 + movdqu $inout2,16*2($out) + movdqu $inout3,16*3($out) + movdqu $inout4,16*4($out) + lea 16*5($out),$out + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_one: + movups ($inp),$inout0 + lea 16*1($inp),$inp + xorps @tweak[0],$inout0 +___ + &aesni_generate1("enc",$key,$rounds); +$code.=<<___; + xorps @tweak[0],$inout0 + movdqa @tweak[1],@tweak[0] + movups $inout0,($out) + lea 16*1($out),$out + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_two: + movups ($inp),$inout0 + movups 16($inp),$inout1 + lea 32($inp),$inp + xorps @tweak[0],$inout0 + xorps @tweak[1],$inout1 + + call _aesni_encrypt3 + + xorps @tweak[0],$inout0 + movdqa @tweak[2],@tweak[0] + xorps @tweak[1],$inout1 + movups $inout0,($out) + movups $inout1,16*1($out) + lea 16*2($out),$out + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_three: + movups ($inp),$inout0 + movups 16*1($inp),$inout1 + movups 16*2($inp),$inout2 + lea 16*3($inp),$inp + xorps @tweak[0],$inout0 + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + + call _aesni_encrypt3 + + xorps @tweak[0],$inout0 + movdqa @tweak[3],@tweak[0] + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + movups $inout0,($out) + movups $inout1,16*1($out) + movups $inout2,16*2($out) + lea 16*3($out),$out + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_four: + movups ($inp),$inout0 + movups 16*1($inp),$inout1 + movups 16*2($inp),$inout2 + xorps @tweak[0],$inout0 + movups 16*3($inp),$inout3 + lea 16*4($inp),$inp + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + xorps @tweak[3],$inout3 + + call _aesni_encrypt4 + + xorps @tweak[0],$inout0 + movdqa @tweak[5],@tweak[0] + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + movups $inout0,($out) + xorps @tweak[3],$inout3 + movups $inout1,16*1($out) + movups $inout2,16*2($out) + movups $inout3,16*3($out) + lea 16*4($out),$out + jmp .Lxts_enc_done + +.align 16 +.Lxts_enc_done: + and \$15,$len_ + jz .Lxts_enc_ret + mov $len_,$len + +.Lxts_enc_steal: + movzb ($inp),%eax # borrow $rounds ... + movzb -16($out),%ecx # ... and $key + lea 1($inp),$inp + mov %al,-16($out) + mov %cl,0($out) + lea 1($out),$out + sub \$1,$len + jnz .Lxts_enc_steal + + sub $len_,$out # rewind $out + mov $key_,$key # restore $key + mov $rnds_,$rounds # restore $rounds + + movups -16($out),$inout0 + xorps @tweak[0],$inout0 +___ + &aesni_generate1("enc",$key,$rounds); +$code.=<<___; + xorps @tweak[0],$inout0 + movups $inout0,-16($out) + +.Lxts_enc_ret: +___ +$code.=<<___ if ($win64); + movaps 0x60(%rsp),%xmm6 + movaps 0x70(%rsp),%xmm7 + movaps 0x80(%rsp),%xmm8 + movaps 0x90(%rsp),%xmm9 + movaps 0xa0(%rsp),%xmm10 + movaps 0xb0(%rsp),%xmm11 + movaps 0xc0(%rsp),%xmm12 + movaps 0xd0(%rsp),%xmm13 + movaps 0xe0(%rsp),%xmm14 + movaps 0xf0(%rsp),%xmm15 +___ +$code.=<<___; + lea $frame_size(%rsp),%rsp +.Lxts_enc_epilogue: + ret +.size aesni_xts_encrypt,.-aesni_xts_encrypt +___ + +$code.=<<___; +.globl aesni_xts_decrypt +.type aesni_xts_decrypt,\@function,6 +.align 16 +aesni_xts_decrypt: + lea -$frame_size(%rsp),%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,0x60(%rsp) + movaps %xmm7,0x70(%rsp) + movaps %xmm8,0x80(%rsp) + movaps %xmm9,0x90(%rsp) + movaps %xmm10,0xa0(%rsp) + movaps %xmm11,0xb0(%rsp) + movaps %xmm12,0xc0(%rsp) + movaps %xmm13,0xd0(%rsp) + movaps %xmm14,0xe0(%rsp) + movaps %xmm15,0xf0(%rsp) +.Lxts_dec_body: +___ +$code.=<<___; + movups ($ivp),@tweak[5] # load clear-text tweak + mov 240($key2),$rounds # key2->rounds + mov 240($key),$rnds_ # key1->rounds +___ + # generate the tweak + &aesni_generate1("enc",$key2,$rounds,@tweak[5]); +$code.=<<___; + xor %eax,%eax # if ($len%16) len-=16; + test \$15,$len + setnz %al + shl \$4,%rax + sub %rax,$len + + mov $key,$key_ # backup $key + mov $rnds_,$rounds # backup $rounds + mov $len,$len_ # backup $len + and \$-16,$len + + movdqa .Lxts_magic(%rip),$twmask + pxor $twtmp,$twtmp + pcmpgtd @tweak[5],$twtmp # broadcast upper bits +___ + for ($i=0;$i<4;$i++) { + $code.=<<___; + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[$i] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + pand $twmask,$twres # isolate carry and residue + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + pxor $twres,@tweak[5] +___ + } +$code.=<<___; + sub \$16*6,$len + jc .Lxts_dec_short + + shr \$1,$rounds + sub \$1,$rounds + mov $rounds,$rnds_ + jmp .Lxts_dec_grandloop + +.align 16 +.Lxts_dec_grandloop: + pshufd \$0x13,$twtmp,$twres + movdqa @tweak[5],@tweak[4] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqu `16*0`($inp),$inout0 # load input + pand $twmask,$twres # isolate carry and residue + movdqu `16*1`($inp),$inout1 + pxor $twres,@tweak[5] + + movdqu `16*2`($inp),$inout2 + pxor @tweak[0],$inout0 # input^=tweak + movdqu `16*3`($inp),$inout3 + pxor @tweak[1],$inout1 + movdqu `16*4`($inp),$inout4 + pxor @tweak[2],$inout2 + movdqu `16*5`($inp),$inout5 + lea `16*6`($inp),$inp + pxor @tweak[3],$inout3 + $movkey ($key_),$rndkey0 + pxor @tweak[4],$inout4 + pxor @tweak[5],$inout5 + + # inline _aesni_decrypt6 and interleave first and last rounds + # with own code... + $movkey 16($key_),$rndkey1 + pxor $rndkey0,$inout0 + pxor $rndkey0,$inout1 + movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks + aesdec $rndkey1,$inout0 + lea 32($key_),$key + pxor $rndkey0,$inout2 + movdqa @tweak[1],`16*1`(%rsp) + aesdec $rndkey1,$inout1 + pxor $rndkey0,$inout3 + movdqa @tweak[2],`16*2`(%rsp) + aesdec $rndkey1,$inout2 + pxor $rndkey0,$inout4 + movdqa @tweak[3],`16*3`(%rsp) + aesdec $rndkey1,$inout3 + pxor $rndkey0,$inout5 + $movkey ($key),$rndkey0 + dec $rounds + movdqa @tweak[4],`16*4`(%rsp) + aesdec $rndkey1,$inout4 + movdqa @tweak[5],`16*5`(%rsp) + aesdec $rndkey1,$inout5 + pxor $twtmp,$twtmp + pcmpgtd @tweak[5],$twtmp + jmp .Lxts_dec_loop6_enter + +.align 16 +.Lxts_dec_loop6: + aesdec $rndkey1,$inout0 + aesdec $rndkey1,$inout1 + dec $rounds + aesdec $rndkey1,$inout2 + aesdec $rndkey1,$inout3 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 +.Lxts_dec_loop6_enter: + $movkey 16($key),$rndkey1 + aesdec $rndkey0,$inout0 + aesdec $rndkey0,$inout1 + lea 32($key),$key + aesdec $rndkey0,$inout2 + aesdec $rndkey0,$inout3 + aesdec $rndkey0,$inout4 + aesdec $rndkey0,$inout5 + $movkey ($key),$rndkey0 + jnz .Lxts_dec_loop6 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesdec $rndkey1,$inout0 + pand $twmask,$twres # isolate carry and residue + aesdec $rndkey1,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcast upper bits + aesdec $rndkey1,$inout2 + pxor $twres,@tweak[5] + aesdec $rndkey1,$inout3 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + $movkey 16($key),$rndkey1 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[0] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesdec $rndkey0,$inout0 + pand $twmask,$twres # isolate carry and residue + aesdec $rndkey0,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesdec $rndkey0,$inout2 + pxor $twres,@tweak[5] + aesdec $rndkey0,$inout3 + aesdec $rndkey0,$inout4 + aesdec $rndkey0,$inout5 + $movkey 32($key),$rndkey0 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[1] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesdec $rndkey1,$inout0 + pand $twmask,$twres # isolate carry and residue + aesdec $rndkey1,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesdec $rndkey1,$inout2 + pxor $twres,@tweak[5] + aesdec $rndkey1,$inout3 + aesdec $rndkey1,$inout4 + aesdec $rndkey1,$inout5 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[2] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + aesdeclast $rndkey0,$inout0 + pand $twmask,$twres # isolate carry and residue + aesdeclast $rndkey0,$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + aesdeclast $rndkey0,$inout2 + pxor $twres,@tweak[5] + aesdeclast $rndkey0,$inout3 + aesdeclast $rndkey0,$inout4 + aesdeclast $rndkey0,$inout5 + + pshufd \$0x13,$twtmp,$twres + pxor $twtmp,$twtmp + movdqa @tweak[5],@tweak[3] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + xorps `16*0`(%rsp),$inout0 # output^=tweak + pand $twmask,$twres # isolate carry and residue + xorps `16*1`(%rsp),$inout1 + pcmpgtd @tweak[5],$twtmp # broadcat upper bits + pxor $twres,@tweak[5] + + xorps `16*2`(%rsp),$inout2 + movups $inout0,`16*0`($out) # write output + xorps `16*3`(%rsp),$inout3 + movups $inout1,`16*1`($out) + xorps `16*4`(%rsp),$inout4 + movups $inout2,`16*2`($out) + xorps `16*5`(%rsp),$inout5 + movups $inout3,`16*3`($out) + mov $rnds_,$rounds # restore $rounds + movups $inout4,`16*4`($out) + movups $inout5,`16*5`($out) + lea `16*6`($out),$out + sub \$16*6,$len + jnc .Lxts_dec_grandloop + + lea 3($rounds,$rounds),$rounds # restore original value + mov $key_,$key # restore $key + mov $rounds,$rnds_ # backup $rounds + +.Lxts_dec_short: + add \$16*6,$len + jz .Lxts_dec_done + + cmp \$0x20,$len + jb .Lxts_dec_one + je .Lxts_dec_two + + cmp \$0x40,$len + jb .Lxts_dec_three + je .Lxts_dec_four + + pshufd \$0x13,$twtmp,$twres + movdqa @tweak[5],@tweak[4] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movdqu ($inp),$inout0 + pand $twmask,$twres # isolate carry and residue + movdqu 16*1($inp),$inout1 + pxor $twres,@tweak[5] + + movdqu 16*2($inp),$inout2 + pxor @tweak[0],$inout0 + movdqu 16*3($inp),$inout3 + pxor @tweak[1],$inout1 + movdqu 16*4($inp),$inout4 + lea 16*5($inp),$inp + pxor @tweak[2],$inout2 + pxor @tweak[3],$inout3 + pxor @tweak[4],$inout4 + + call _aesni_decrypt6 + + xorps @tweak[0],$inout0 + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + movdqu $inout0,($out) + xorps @tweak[3],$inout3 + movdqu $inout1,16*1($out) + xorps @tweak[4],$inout4 + movdqu $inout2,16*2($out) + pxor $twtmp,$twtmp + movdqu $inout3,16*3($out) + pcmpgtd @tweak[5],$twtmp + movdqu $inout4,16*4($out) + lea 16*5($out),$out + pshufd \$0x13,$twtmp,@tweak[1] # $twres + and \$15,$len_ + jz .Lxts_dec_ret + + movdqa @tweak[5],@tweak[0] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + pand $twmask,@tweak[1] # isolate carry and residue + pxor @tweak[5],@tweak[1] + jmp .Lxts_dec_done2 + +.align 16 +.Lxts_dec_one: + movups ($inp),$inout0 + lea 16*1($inp),$inp + xorps @tweak[0],$inout0 ___ &aesni_generate1("dec",$key,$rounds); $code.=<<___; + xorps @tweak[0],$inout0 + movdqa @tweak[1],@tweak[0] movups $inout0,($out) - jmp .Lecb_ret + movdqa @tweak[2],@tweak[1] + lea 16*1($out),$out + jmp .Lxts_dec_done + .align 16 -.Lecb_dec_two: +.Lxts_dec_two: + movups ($inp),$inout0 + movups 16($inp),$inout1 + lea 32($inp),$inp + xorps @tweak[0],$inout0 + xorps @tweak[1],$inout1 + call _aesni_decrypt3 + + xorps @tweak[0],$inout0 + movdqa @tweak[2],@tweak[0] + xorps @tweak[1],$inout1 + movdqa @tweak[3],@tweak[1] movups $inout0,($out) - movups $inout1,0x10($out) - jmp .Lecb_ret + movups $inout1,16*1($out) + lea 16*2($out),$out + jmp .Lxts_dec_done + .align 16 -.Lecb_dec_three: +.Lxts_dec_three: + movups ($inp),$inout0 + movups 16*1($inp),$inout1 + movups 16*2($inp),$inout2 + lea 16*3($inp),$inp + xorps @tweak[0],$inout0 + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + call _aesni_decrypt3 + + xorps @tweak[0],$inout0 + movdqa @tweak[3],@tweak[0] + xorps @tweak[1],$inout1 + movdqa @tweak[5],@tweak[1] + xorps @tweak[2],$inout2 movups $inout0,($out) - movups $inout1,0x10($out) - movups $inout2,0x20($out) + movups $inout1,16*1($out) + movups $inout2,16*2($out) + lea 16*3($out),$out + jmp .Lxts_dec_done -.Lecb_ret: +.align 16 +.Lxts_dec_four: + pshufd \$0x13,$twtmp,$twres + movdqa @tweak[5],@tweak[4] + paddq @tweak[5],@tweak[5] # psllq 1,$tweak + movups ($inp),$inout0 + pand $twmask,$twres # isolate carry and residue + movups 16*1($inp),$inout1 + pxor $twres,@tweak[5] + + movups 16*2($inp),$inout2 + xorps @tweak[0],$inout0 + movups 16*3($inp),$inout3 + lea 16*4($inp),$inp + xorps @tweak[1],$inout1 + xorps @tweak[2],$inout2 + xorps @tweak[3],$inout3 + + call _aesni_decrypt4 + + xorps @tweak[0],$inout0 + movdqa @tweak[4],@tweak[0] + xorps @tweak[1],$inout1 + movdqa @tweak[5],@tweak[1] + xorps @tweak[2],$inout2 + movups $inout0,($out) + xorps @tweak[3],$inout3 + movups $inout1,16*1($out) + movups $inout2,16*2($out) + movups $inout3,16*3($out) + lea 16*4($out),$out + jmp .Lxts_dec_done + +.align 16 +.Lxts_dec_done: + and \$15,$len_ + jz .Lxts_dec_ret +.Lxts_dec_done2: + mov $len_,$len + mov $key_,$key # restore $key + mov $rnds_,$rounds # restore $rounds + + movups ($inp),$inout0 + xorps @tweak[1],$inout0 +___ + &aesni_generate1("dec",$key,$rounds); +$code.=<<___; + xorps @tweak[1],$inout0 + movups $inout0,($out) + +.Lxts_dec_steal: + movzb 16($inp),%eax # borrow $rounds ... + movzb ($out),%ecx # ... and $key + lea 1($inp),$inp + mov %al,($out) + mov %cl,16($out) + lea 1($out),$out + sub \$1,$len + jnz .Lxts_dec_steal + + sub $len_,$out # rewind $out + mov $key_,$key # restore $key + mov $rnds_,$rounds # restore $rounds + + movups ($out),$inout0 + xorps @tweak[0],$inout0 +___ + &aesni_generate1("dec",$key,$rounds); +$code.=<<___; + xorps @tweak[0],$inout0 + movups $inout0,($out) + +.Lxts_dec_ret: +___ +$code.=<<___ if ($win64); + movaps 0x60(%rsp),%xmm6 + movaps 0x70(%rsp),%xmm7 + movaps 0x80(%rsp),%xmm8 + movaps 0x90(%rsp),%xmm9 + movaps 0xa0(%rsp),%xmm10 + movaps 0xb0(%rsp),%xmm11 + movaps 0xc0(%rsp),%xmm12 + movaps 0xd0(%rsp),%xmm13 + movaps 0xe0(%rsp),%xmm14 + movaps 0xf0(%rsp),%xmm15 +___ +$code.=<<___; + lea $frame_size(%rsp),%rsp +.Lxts_dec_epilogue: ret -.size aesni_ecb_encrypt,.-aesni_ecb_encrypt +.size aesni_xts_decrypt,.-aesni_xts_decrypt ___ -} +} }} +######################################################################## # void $PREFIX_cbc_encrypt (const void *inp, void *out, # size_t length, const AES_KEY *key, # unsigned char *ivp,const int enc); -$reserved = $win64?0x40:-0x18; # used in decrypt +{ +my $reserved = $win64?0x40:-0x18; # used in decrypt $code.=<<___; .globl ${PREFIX}_cbc_encrypt .type ${PREFIX}_cbc_encrypt,\@function,6 @@ -371,30 +2129,30 @@ ${PREFIX}_cbc_encrypt: test $len,$len # check length jz .Lcbc_ret - mov 240($key),$rnds_ # pull $rounds + mov 240($key),$rnds_ # key->rounds mov $key,$key_ # backup $key test %r9d,%r9d # 6th argument jz .Lcbc_decrypt #--------------------------- CBC ENCRYPT ------------------------------# movups ($ivp),$inout0 # load iv as initial state - cmp \$16,$len mov $rnds_,$rounds + cmp \$16,$len jb .Lcbc_enc_tail sub \$16,$len jmp .Lcbc_enc_loop -.align 16 +.align 16 .Lcbc_enc_loop: movups ($inp),$inout1 # load input lea 16($inp),$inp - pxor $inout1,$inout0 + #xorps $inout1,$inout0 ___ - &aesni_generate1("enc",$key,$rounds); + &aesni_generate1("enc",$key,$rounds,$inout0,$inout1); $code.=<<___; - sub \$16,$len - lea 16($out),$out mov $rnds_,$rounds # restore $rounds mov $key_,$key # restore $key - movups $inout0,-16($out) # store output + movups $inout0,0($out) # store output + lea 16($out),$out + sub \$16,$len jnc .Lcbc_enc_loop add \$16,$len jnz .Lcbc_enc_tail @@ -429,92 +2187,238 @@ $code.=<<___ if ($win64); ___ $code.=<<___; movups ($ivp),$iv - sub \$0x40,$len mov $rnds_,$rounds + cmp \$0x70,$len jbe .Lcbc_dec_tail - jmp .Lcbc_dec_loop3 -.align 16 -.Lcbc_dec_loop3: - movups ($inp),$inout0 + shr \$1,$rnds_ + sub \$0x70,$len + mov $rnds_,$rounds + movaps $iv,$reserved(%rsp) + jmp .Lcbc_dec_loop8_enter +.align 16 +.Lcbc_dec_loop8: + movaps $rndkey0,$reserved(%rsp) # save IV + movups $inout7,($out) + lea 0x10($out),$out +.Lcbc_dec_loop8_enter: + $movkey ($key),$rndkey0 + movups ($inp),$inout0 # load input movups 0x10($inp),$inout1 - movups 0x20($inp),$inout2 - movaps $inout0,$in0 - movaps $inout1,$in1 - movaps $inout2,$in2 - call _aesni_decrypt3 - sub \$0x30,$len - lea 0x30($inp),$inp - lea 0x30($out),$out - pxor $iv,$inout0 - pxor $in0,$inout1 - movaps $in2,$iv - pxor $in1,$inout2 - movups $inout0,-0x30($out) - mov $rnds_,$rounds # restore $rounds - movups $inout1,-0x20($out) - mov $key_,$key # restore $key - movups $inout2,-0x10($out) - ja .Lcbc_dec_loop3 + $movkey 16($key),$rndkey1 -.Lcbc_dec_tail: - add \$0x40,$len - movups $iv,($ivp) - jz .Lcbc_dec_ret + lea 32($key),$key + movdqu 0x20($inp),$inout2 + xorps $rndkey0,$inout0 + movdqu 0x30($inp),$inout3 + xorps $rndkey0,$inout1 + movdqu 0x40($inp),$inout4 + aesdec $rndkey1,$inout0 + pxor $rndkey0,$inout2 + movdqu 0x50($inp),$inout5 + aesdec $rndkey1,$inout1 + pxor $rndkey0,$inout3 + movdqu 0x60($inp),$inout6 + aesdec $rndkey1,$inout2 + pxor $rndkey0,$inout4 + movdqu 0x70($inp),$inout7 + aesdec $rndkey1,$inout3 + pxor $rndkey0,$inout5 + dec $rounds + aesdec $rndkey1,$inout4 + pxor $rndkey0,$inout6 + aesdec $rndkey1,$inout5 + pxor $rndkey0,$inout7 + $movkey ($key),$rndkey0 + aesdec $rndkey1,$inout6 + aesdec $rndkey1,$inout7 + $movkey 16($key),$rndkey1 + call .Ldec_loop8_enter + + movups ($inp),$rndkey1 # re-load input + movups 0x10($inp),$rndkey0 + xorps $reserved(%rsp),$inout0 # ^= IV + xorps $rndkey1,$inout1 + movups 0x20($inp),$rndkey1 + xorps $rndkey0,$inout2 + movups 0x30($inp),$rndkey0 + xorps $rndkey1,$inout3 + movups 0x40($inp),$rndkey1 + xorps $rndkey0,$inout4 + movups 0x50($inp),$rndkey0 + xorps $rndkey1,$inout5 + movups 0x60($inp),$rndkey1 + xorps $rndkey0,$inout6 + movups 0x70($inp),$rndkey0 # IV + xorps $rndkey1,$inout7 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + mov $rnds_,$rounds # restore $rounds + movups $inout4,0x40($out) + mov $key_,$key # restore $key + movups $inout5,0x50($out) + lea 0x80($inp),$inp + movups $inout6,0x60($out) + lea 0x70($out),$out + sub \$0x80,$len + ja .Lcbc_dec_loop8 + + movaps $inout7,$inout0 + movaps $rndkey0,$iv + add \$0x70,$len + jle .Lcbc_dec_tail_collected + movups $inout0,($out) + lea 1($rnds_,$rnds_),$rounds + lea 0x10($out),$out +.Lcbc_dec_tail: movups ($inp),$inout0 - cmp \$0x10,$len movaps $inout0,$in0 + cmp \$0x10,$len jbe .Lcbc_dec_one + movups 0x10($inp),$inout1 - cmp \$0x20,$len movaps $inout1,$in1 + cmp \$0x20,$len jbe .Lcbc_dec_two + movups 0x20($inp),$inout2 - cmp \$0x30,$len movaps $inout2,$in2 + cmp \$0x30,$len jbe .Lcbc_dec_three + movups 0x30($inp),$inout3 - call _aesni_decrypt4 - pxor $iv,$inout0 - movups 0x30($inp),$iv - pxor $in0,$inout1 + cmp \$0x40,$len + jbe .Lcbc_dec_four + + movups 0x40($inp),$inout4 + cmp \$0x50,$len + jbe .Lcbc_dec_five + + movups 0x50($inp),$inout5 + cmp \$0x60,$len + jbe .Lcbc_dec_six + + movups 0x60($inp),$inout6 + movaps $iv,$reserved(%rsp) # save IV + call _aesni_decrypt8 + movups ($inp),$rndkey1 + movups 0x10($inp),$rndkey0 + xorps $reserved(%rsp),$inout0 # ^= IV + xorps $rndkey1,$inout1 + movups 0x20($inp),$rndkey1 + xorps $rndkey0,$inout2 + movups 0x30($inp),$rndkey0 + xorps $rndkey1,$inout3 + movups 0x40($inp),$rndkey1 + xorps $rndkey0,$inout4 + movups 0x50($inp),$rndkey0 + xorps $rndkey1,$inout5 + movups 0x60($inp),$iv # IV + xorps $rndkey0,$inout6 movups $inout0,($out) - pxor $in1,$inout2 movups $inout1,0x10($out) - pxor $in2,$inout3 movups $inout2,0x20($out) - movaps $inout3,$inout0 - lea 0x30($out),$out + movups $inout3,0x30($out) + movups $inout4,0x40($out) + movups $inout5,0x50($out) + lea 0x60($out),$out + movaps $inout6,$inout0 + sub \$0x70,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_one: ___ &aesni_generate1("dec",$key,$rounds); $code.=<<___; - pxor $iv,$inout0 + xorps $iv,$inout0 movaps $in0,$iv + sub \$0x10,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_two: + xorps $inout2,$inout2 call _aesni_decrypt3 - pxor $iv,$inout0 - pxor $in0,$inout1 + xorps $iv,$inout0 + xorps $in0,$inout1 movups $inout0,($out) movaps $in1,$iv movaps $inout1,$inout0 lea 0x10($out),$out + sub \$0x20,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_three: call _aesni_decrypt3 - pxor $iv,$inout0 - pxor $in0,$inout1 + xorps $iv,$inout0 + xorps $in0,$inout1 movups $inout0,($out) - pxor $in1,$inout2 + xorps $in1,$inout2 movups $inout1,0x10($out) movaps $in2,$iv movaps $inout2,$inout0 lea 0x20($out),$out + sub \$0x30,$len + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_four: + call _aesni_decrypt4 + xorps $iv,$inout0 + movups 0x30($inp),$iv + xorps $in0,$inout1 + movups $inout0,($out) + xorps $in1,$inout2 + movups $inout1,0x10($out) + xorps $in2,$inout3 + movups $inout2,0x20($out) + movaps $inout3,$inout0 + lea 0x30($out),$out + sub \$0x40,$len + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_five: + xorps $inout5,$inout5 + call _aesni_decrypt6 + movups 0x10($inp),$rndkey1 + movups 0x20($inp),$rndkey0 + xorps $iv,$inout0 + xorps $in0,$inout1 + xorps $rndkey1,$inout2 + movups 0x30($inp),$rndkey1 + xorps $rndkey0,$inout3 + movups 0x40($inp),$iv + xorps $rndkey1,$inout4 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + lea 0x40($out),$out + movaps $inout4,$inout0 + sub \$0x50,$len + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_six: + call _aesni_decrypt6 + movups 0x10($inp),$rndkey1 + movups 0x20($inp),$rndkey0 + xorps $iv,$inout0 + xorps $in0,$inout1 + xorps $rndkey1,$inout2 + movups 0x30($inp),$rndkey1 + xorps $rndkey0,$inout3 + movups 0x40($inp),$rndkey0 + xorps $rndkey1,$inout4 + movups 0x50($inp),$iv + xorps $rndkey0,$inout5 + movups $inout0,($out) + movups $inout1,0x10($out) + movups $inout2,0x20($out) + movups $inout3,0x30($out) + movups $inout4,0x40($out) + lea 0x50($out),$out + movaps $inout5,$inout0 + sub \$0x60,$len jmp .Lcbc_dec_tail_collected .align 16 .Lcbc_dec_tail_collected: @@ -523,10 +2427,12 @@ $code.=<<___; jnz .Lcbc_dec_tail_partial movups $inout0,($out) jmp .Lcbc_dec_ret +.align 16 .Lcbc_dec_tail_partial: movaps $inout0,$reserved(%rsp) + mov \$16,%rcx mov $out,%rdi - mov $len,%rcx + sub $len,%rcx lea $reserved(%rsp),%rsi .long 0x9066A4F3 # rep movsb @@ -544,7 +2450,7 @@ $code.=<<___; ret .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt ___ - +} # int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey, # int bits, AES_KEY *key) { my ($inp,$bits,$key) = @_4args; @@ -556,7 +2462,7 @@ $code.=<<___; .align 16 ${PREFIX}_set_decrypt_key: .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 - call _aesni_set_encrypt_key + call __aesni_set_encrypt_key shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key test %eax,%eax jnz .Ldec_key_ret @@ -576,9 +2482,9 @@ ${PREFIX}_set_decrypt_key: aesimc %xmm1,%xmm1 lea 16($key),$key lea -16($inp),$inp - cmp $key,$inp $movkey %xmm0,16($inp) $movkey %xmm1,-16($key) + cmp $key,$inp ja .Ldec_key_inverse $movkey ($key),%xmm0 # inverse middle @@ -605,16 +2511,16 @@ $code.=<<___; .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent .align 16 ${PREFIX}_set_encrypt_key: -_aesni_set_encrypt_key: +__aesni_set_encrypt_key: .byte 0x48,0x83,0xEC,0x08 # sub rsp,8 - test $inp,$inp mov \$-1,%rax + test $inp,$inp jz .Lenc_key_ret test $key,$key jz .Lenc_key_ret movups ($inp),%xmm0 # pull first 128 bits of *userKey - pxor %xmm4,%xmm4 # low dword of xmm4 is assumed 0 + xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0 lea 16($key),%rax cmp \$256,$bits je .L14rounds @@ -729,11 +2635,11 @@ _aesni_set_encrypt_key: lea 16(%rax),%rax .Lkey_expansion_128_cold: shufps \$0b00010000,%xmm0,%xmm4 - pxor %xmm4, %xmm0 + xorps %xmm4, %xmm0 shufps \$0b10001100,%xmm0,%xmm4 - pxor %xmm4, %xmm0 - pshufd \$0b11111111,%xmm1,%xmm1 # critical path - pxor %xmm1,%xmm0 + xorps %xmm4, %xmm0 + shufps \$0b11111111,%xmm1,%xmm1 # critical path + xorps %xmm1,%xmm0 ret .align 16 @@ -744,11 +2650,11 @@ _aesni_set_encrypt_key: movaps %xmm2, %xmm5 .Lkey_expansion_192b_warm: shufps \$0b00010000,%xmm0,%xmm4 - movaps %xmm2,%xmm3 - pxor %xmm4,%xmm0 + movdqa %xmm2,%xmm3 + xorps %xmm4,%xmm0 shufps \$0b10001100,%xmm0,%xmm4 pslldq \$4,%xmm3 - pxor %xmm4,%xmm0 + xorps %xmm4,%xmm0 pshufd \$0b01010101,%xmm1,%xmm1 # critical path pxor %xmm3,%xmm2 pxor %xmm1,%xmm0 @@ -772,11 +2678,11 @@ _aesni_set_encrypt_key: lea 16(%rax),%rax .Lkey_expansion_256a_cold: shufps \$0b00010000,%xmm0,%xmm4 - pxor %xmm4,%xmm0 + xorps %xmm4,%xmm0 shufps \$0b10001100,%xmm0,%xmm4 - pxor %xmm4,%xmm0 - pshufd \$0b11111111,%xmm1,%xmm1 # critical path - pxor %xmm1,%xmm0 + xorps %xmm4,%xmm0 + shufps \$0b11111111,%xmm1,%xmm1 # critical path + xorps %xmm1,%xmm0 ret .align 16 @@ -785,17 +2691,28 @@ _aesni_set_encrypt_key: lea 16(%rax),%rax shufps \$0b00010000,%xmm2,%xmm4 - pxor %xmm4,%xmm2 + xorps %xmm4,%xmm2 shufps \$0b10001100,%xmm2,%xmm4 - pxor %xmm4,%xmm2 - pshufd \$0b10101010,%xmm1,%xmm1 # critical path - pxor %xmm1,%xmm2 + xorps %xmm4,%xmm2 + shufps \$0b10101010,%xmm1,%xmm1 # critical path + xorps %xmm1,%xmm2 ret .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key +.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key ___ } $code.=<<___; +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lincrement32: + .long 6,6,6,0 +.Lincrement64: + .long 1,0,0,0 +.Lxts_magic: + .long 0x87,0,1,0 + .asciz "AES for Intel AES-NI, CRYPTOGAMS by " .align 64 ___ @@ -810,9 +2727,11 @@ $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind -.type cbc_se_handler,\@abi-omnipotent +___ +$code.=<<___ if ($PREFIX eq "aesni"); +.type ecb_se_handler,\@abi-omnipotent .align 16 -cbc_se_handler: +ecb_se_handler: push %rsi push %rdi push %rbx @@ -825,42 +2744,132 @@ cbc_se_handler: sub \$64,%rsp mov 152($context),%rax # pull context->Rsp + + jmp .Lcommon_seh_tail +.size ecb_se_handler,.-ecb_se_handler + +.type ccm64_se_handler,\@abi-omnipotent +.align 16 +ccm64_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip - lea .Lcbc_decrypt(%rip),%r10 - cmp %r10,%rbx # context->Rip<"prologue" label - jb .Lin_prologue + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData - lea .Lcbc_decrypt_body(%rip),%r10 - cmp %r10,%rbx # context->RipRipRip>="epilogue" label - jae .Lin_prologue + mov 152($context),%rax # pull context->Rsp - lea 0(%rax),%rsi # top of stack + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea 0(%rax),%rsi # %xmm save area lea 512($context),%rdi # &context.Xmm6 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq lea 0x58(%rax),%rax # adjust stack pointer - jmp .Lin_prologue -.Lrestore_rax: - mov 120($context),%rax -.Lin_prologue: - mov 8(%rax),%rdi - mov 16(%rax),%rsi - mov %rax,152($context) # restore context->Rsp - mov %rsi,168($context) # restore context->Rsi - mov %rdi,176($context) # restore context->Rdi + jmp .Lcommon_seh_tail +.size ccm64_se_handler,.-ccm64_se_handler - jmp .Lcommon_seh_exit -.size cbc_se_handler,.-cbc_se_handler +.type ctr32_se_handler,\@abi-omnipotent +.align 16 +ctr32_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp -.type ecb_se_handler,\@abi-omnipotent + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + lea .Lctr32_body(%rip),%r10 + cmp %r10,%rbx # context->Rip<"prologue" label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + lea .Lctr32_ret(%rip),%r10 + cmp %r10,%rbx + jae .Lcommon_seh_tail + + lea 0x20(%rax),%rsi # %xmm save area + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0xc8(%rax),%rax # adjust stack pointer + + jmp .Lcommon_seh_tail +.size ctr32_se_handler,.-ctr32_se_handler + +.type xts_se_handler,\@abi-omnipotent .align 16 -ecb_se_handler: +xts_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue lable + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea 0x60(%rax),%rsi # %xmm save area + lea 512($context),%rdi # & context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0x68+160(%rax),%rax # adjust stack pointer + + jmp .Lcommon_seh_tail +.size xts_se_handler,.-xts_se_handler +___ +$code.=<<___; +.type cbc_se_handler,\@abi-omnipotent +.align 16 +cbc_se_handler: push %rsi push %rdi push %rbx @@ -873,13 +2882,37 @@ ecb_se_handler: sub \$64,%rsp mov 152($context),%rax # pull context->Rsp + mov 248($context),%rbx # pull context->Rip + + lea .Lcbc_decrypt(%rip),%r10 + cmp %r10,%rbx # context->Rip<"prologue" label + jb .Lcommon_seh_tail + + lea .Lcbc_decrypt_body(%rip),%r10 + cmp %r10,%rbx # context->RipRip>="epilogue" label + jae .Lcommon_seh_tail + + lea 0(%rax),%rsi # top of stack + lea 512($context),%rdi # &context.Xmm6 + mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0x58(%rax),%rax # adjust stack pointer + jmp .Lcommon_seh_tail + +.Lrestore_cbc_rax: + mov 120($context),%rax + +.Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi -.Lcommon_seh_exit: - mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) @@ -915,10 +2948,33 @@ ecb_se_handler: .section .pdata .align 4 - .rva .LSEH_begin_${PREFIX}_ecb_encrypt - .rva .LSEH_end_${PREFIX}_ecb_encrypt +___ +$code.=<<___ if ($PREFIX eq "aesni"); + .rva .LSEH_begin_aesni_ecb_encrypt + .rva .LSEH_end_aesni_ecb_encrypt .rva .LSEH_info_ecb + .rva .LSEH_begin_aesni_ccm64_encrypt_blocks + .rva .LSEH_end_aesni_ccm64_encrypt_blocks + .rva .LSEH_info_ccm64_enc + + .rva .LSEH_begin_aesni_ccm64_decrypt_blocks + .rva .LSEH_end_aesni_ccm64_decrypt_blocks + .rva .LSEH_info_ccm64_dec + + .rva .LSEH_begin_aesni_ctr32_encrypt_blocks + .rva .LSEH_end_aesni_ctr32_encrypt_blocks + .rva .LSEH_info_ctr32 + + .rva .LSEH_begin_aesni_xts_encrypt + .rva .LSEH_end_aesni_xts_encrypt + .rva .LSEH_info_xts_enc + + .rva .LSEH_begin_aesni_xts_decrypt + .rva .LSEH_end_aesni_xts_decrypt + .rva .LSEH_info_xts_dec +___ +$code.=<<___; .rva .LSEH_begin_${PREFIX}_cbc_encrypt .rva .LSEH_end_${PREFIX}_cbc_encrypt .rva .LSEH_info_cbc @@ -932,28 +2988,49 @@ ecb_se_handler: .rva .LSEH_info_key .section .xdata .align 8 +___ +$code.=<<___ if ($PREFIX eq "aesni"); .LSEH_info_ecb: .byte 9,0,0,0 .rva ecb_se_handler +.LSEH_info_ccm64_enc: + .byte 9,0,0,0 + .rva ccm64_se_handler + .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[] +.LSEH_info_ccm64_dec: + .byte 9,0,0,0 + .rva ccm64_se_handler + .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[] +.LSEH_info_ctr32: + .byte 9,0,0,0 + .rva ctr32_se_handler +.LSEH_info_xts_enc: + .byte 9,0,0,0 + .rva xts_se_handler + .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] +.LSEH_info_xts_dec: + .byte 9,0,0,0 + .rva xts_se_handler + .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] +___ +$code.=<<___; .LSEH_info_cbc: .byte 9,0,0,0 .rva cbc_se_handler .LSEH_info_key: .byte 0x01,0x04,0x01,0x00 - .byte 0x04,0x02,0x00,0x00 + .byte 0x04,0x02,0x00,0x00 # sub rsp,8 ___ } sub rex { - local *opcode=shift; - my ($dst,$src)=@_; - - if ($dst>=8 || $src>=8) { - $rex=0x40; - $rex|=0x04 if($dst>=8); - $rex|=0x01 if($src>=8); - push @opcode,$rex; - } + local *opcode=shift; + my ($dst,$src)=@_; + my $rex=0; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @opcode,$rex|0x40 if($rex); } sub aesni { @@ -989,4 +3066,3 @@ $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; print $code; close STDOUT; - diff --git a/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl new file mode 100644 index 0000000000..c9c6312fa7 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/bsaes-x86_64.pl @@ -0,0 +1,3044 @@ +#!/usr/bin/env perl + +################################################################### +### AES-128 [originally in CTR mode] ### +### bitsliced implementation for Intel Core 2 processors ### +### requires support of SSE extensions up to SSSE3 ### +### Author: Emilia Käsper and Peter Schwabe ### +### Date: 2009-03-19 ### +### Public domain ### +### ### +### See http://homes.esat.kuleuven.be/~ekasper/#software for ### +### further information. ### +################################################################### +# +# September 2011. +# +# Started as transliteration to "perlasm" the original code has +# undergone following changes: +# +# - code was made position-independent; +# - rounds were folded into a loop resulting in >5x size reduction +# from 12.5KB to 2.2KB; +# - above was possibile thanks to mixcolumns() modification that +# allowed to feed its output back to aesenc[last], this was +# achieved at cost of two additional inter-registers moves; +# - some instruction reordering and interleaving; +# - this module doesn't implement key setup subroutine, instead it +# relies on conversion of "conventional" key schedule as returned +# by AES_set_encrypt_key (see discussion below); +# - first and last round keys are treated differently, which allowed +# to skip one shiftrows(), reduce bit-sliced key schedule and +# speed-up conversion by 22%; +# - support for 192- and 256-bit keys was added; +# +# Resulting performance in CPU cycles spent to encrypt one byte out +# of 4096-byte buffer with 128-bit key is: +# +# Emilia's this(*) difference +# +# Core 2 9.30 8.69 +7% +# Nehalem(**) 7.63 6.98 +9% +# Atom 17.1 17.4 -2%(***) +# +# (*) Comparison is not completely fair, because "this" is ECB, +# i.e. no extra processing such as counter values calculation +# and xor-ing input as in Emilia's CTR implementation is +# performed. However, the CTR calculations stand for not more +# than 1% of total time, so comparison is *rather* fair. +# +# (**) Results were collected on Westmere, which is considered to +# be equivalent to Nehalem for this code. +# +# (***) Slowdown on Atom is rather strange per se, because original +# implementation has a number of 9+-bytes instructions, which +# are bad for Atom front-end, and which I eliminated completely. +# In attempt to address deterioration sbox() was tested in FP +# SIMD "domain" (movaps instead of movdqa, xorps instead of +# pxor, etc.). While it resulted in nominal 4% improvement on +# Atom, it hurted Westmere by more than 2x factor. +# +# As for key schedule conversion subroutine. Interface to OpenSSL +# relies on per-invocation on-the-fly conversion. This naturally +# has impact on performance, especially for short inputs. Conversion +# time in CPU cycles and its ratio to CPU cycles spent in 8x block +# function is: +# +# conversion conversion/8x block +# Core 2 240 0.22 +# Nehalem 180 0.20 +# Atom 430 0.19 +# +# The ratio values mean that 128-byte blocks will be processed +# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, +# etc. Then keep in mind that input sizes not divisible by 128 are +# *effectively* slower, especially shortest ones, e.g. consecutive +# 144-byte blocks are processed 44% slower than one would expect, +# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings" +# it's still faster than ["hyper-threading-safe" code path in] +# aes-x86_64.pl on all lengths above 64 bytes... +# +# October 2011. +# +# Add decryption procedure. Performance in CPU cycles spent to decrypt +# one byte out of 4096-byte buffer with 128-bit key is: +# +# Core 2 11.0 +# Nehalem 9.16 +# Atom 20.9 +# +# November 2011. +# +# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is +# suboptimal, but XTS is meant to be used with larger blocks... +# +# + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx"); +my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15) +my $ecb=0; # suppress unreferenced ECB subroutines, spare some space... + +{ +my ($key,$rounds,$const)=("%rax","%r10d","%r11"); + +sub Sbox { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb +my @b=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; + &InBasisChange (@b); + &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s); + &OutBasisChange (@b[7,1,4,2,6,5,0,3]); +} + +sub InBasisChange { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb +my @b=@_[0..7]; +$code.=<<___; + pxor @b[6], @b[5] + pxor @b[1], @b[2] + pxor @b[0], @b[3] + pxor @b[2], @b[6] + pxor @b[0], @b[5] + + pxor @b[3], @b[6] + pxor @b[7], @b[3] + pxor @b[5], @b[7] + pxor @b[4], @b[3] + pxor @b[5], @b[4] + pxor @b[1], @b[3] + + pxor @b[7], @b[2] + pxor @b[5], @b[1] +___ +} + +sub OutBasisChange { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb +my @b=@_[0..7]; +$code.=<<___; + pxor @b[6], @b[0] + pxor @b[4], @b[1] + pxor @b[0], @b[2] + pxor @b[6], @b[4] + pxor @b[1], @b[6] + + pxor @b[5], @b[1] + pxor @b[3], @b[5] + pxor @b[7], @b[3] + pxor @b[5], @b[7] + pxor @b[5], @b[2] + + pxor @b[7], @b[4] +___ +} + +sub InvSbox { +# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb +# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb +my @b=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; + &InvInBasisChange (@b); + &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s); + &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]); +} + +sub InvInBasisChange { # OutBasisChange in reverse +my @b=@_[5,1,2,6,3,7,0,4]; +$code.=<<___ + pxor @b[7], @b[4] + + pxor @b[5], @b[7] + pxor @b[5], @b[2] + pxor @b[7], @b[3] + pxor @b[3], @b[5] + pxor @b[5], @b[1] + + pxor @b[1], @b[6] + pxor @b[0], @b[2] + pxor @b[6], @b[4] + pxor @b[6], @b[0] + pxor @b[4], @b[1] +___ +} + +sub InvOutBasisChange { # InBasisChange in reverse +my @b=@_[2,5,7,3,6,1,0,4]; +$code.=<<___; + pxor @b[5], @b[1] + pxor @b[7], @b[2] + + pxor @b[1], @b[3] + pxor @b[5], @b[4] + pxor @b[5], @b[7] + pxor @b[4], @b[3] + pxor @b[0], @b[5] + pxor @b[7], @b[3] + pxor @b[2], @b[6] + pxor @b[1], @b[2] + pxor @b[3], @b[6] + + pxor @b[0], @b[3] + pxor @b[6], @b[5] +___ +} + +sub Mul_GF4 { +#;************************************************************* +#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) * +#;************************************************************* +my ($x0,$x1,$y0,$y1,$t0)=@_; +$code.=<<___; + movdqa $y0, $t0 + pxor $y1, $t0 + pand $x0, $t0 + pxor $x1, $x0 + pand $y0, $x1 + pand $y1, $x0 + pxor $x1, $x0 + pxor $t0, $x1 +___ +} + +sub Mul_GF4_N { # not used, see next subroutine +# multiply and scale by N +my ($x0,$x1,$y0,$y1,$t0)=@_; +$code.=<<___; + movdqa $y0, $t0 + pxor $y1, $t0 + pand $x0, $t0 + pxor $x1, $x0 + pand $y0, $x1 + pand $y1, $x0 + pxor $x0, $x1 + pxor $t0, $x0 +___ +} + +sub Mul_GF4_N_GF4 { +# interleaved Mul_GF4_N and Mul_GF4 +my ($x0,$x1,$y0,$y1,$t0, + $x2,$x3,$y2,$y3,$t1)=@_; +$code.=<<___; + movdqa $y0, $t0 + movdqa $y2, $t1 + pxor $y1, $t0 + pxor $y3, $t1 + pand $x0, $t0 + pand $x2, $t1 + pxor $x1, $x0 + pxor $x3, $x2 + pand $y0, $x1 + pand $y2, $x3 + pand $y1, $x0 + pand $y3, $x2 + pxor $x0, $x1 + pxor $x3, $x2 + pxor $t0, $x0 + pxor $t1, $x3 +___ +} +sub Mul_GF16_2 { +my @x=@_[0..7]; +my @y=@_[8..11]; +my @t=@_[12..15]; +$code.=<<___; + movdqa @x[0], @t[0] + movdqa @x[1], @t[1] +___ + &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]); +$code.=<<___; + pxor @x[2], @t[0] + pxor @x[3], @t[1] + pxor @y[2], @y[0] + pxor @y[3], @y[1] +___ + Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], + @x[2], @x[3], @y[2], @y[3], @t[2]); +$code.=<<___; + pxor @t[0], @x[0] + pxor @t[0], @x[2] + pxor @t[1], @x[1] + pxor @t[1], @x[3] + + movdqa @x[4], @t[0] + movdqa @x[5], @t[1] + pxor @x[6], @t[0] + pxor @x[7], @t[1] +___ + &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3], + @x[6], @x[7], @y[2], @y[3], @t[2]); +$code.=<<___; + pxor @y[2], @y[0] + pxor @y[3], @y[1] +___ + &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]); +$code.=<<___; + pxor @t[0], @x[4] + pxor @t[0], @x[6] + pxor @t[1], @x[5] + pxor @t[1], @x[7] +___ +} +sub Inv_GF256 { +#;******************************************************************** +#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) * +#;******************************************************************** +my @x=@_[0..7]; +my @t=@_[8..11]; +my @s=@_[12..15]; +# direct optimizations from hardware +$code.=<<___; + movdqa @x[4], @t[3] + movdqa @x[5], @t[2] + movdqa @x[1], @t[1] + movdqa @x[7], @s[1] + movdqa @x[0], @s[0] + + pxor @x[6], @t[3] + pxor @x[7], @t[2] + pxor @x[3], @t[1] + movdqa @t[3], @s[2] + pxor @x[6], @s[1] + movdqa @t[2], @t[0] + pxor @x[2], @s[0] + movdqa @t[3], @s[3] + + por @t[1], @t[2] + por @s[0], @t[3] + pxor @t[0], @s[3] + pand @s[0], @s[2] + pxor @t[1], @s[0] + pand @t[1], @t[0] + pand @s[0], @s[3] + movdqa @x[3], @s[0] + pxor @x[2], @s[0] + pand @s[0], @s[1] + pxor @s[1], @t[3] + pxor @s[1], @t[2] + movdqa @x[4], @s[1] + movdqa @x[1], @s[0] + pxor @x[5], @s[1] + pxor @x[0], @s[0] + movdqa @s[1], @t[1] + pand @s[0], @s[1] + por @s[0], @t[1] + pxor @s[1], @t[0] + pxor @s[3], @t[3] + pxor @s[2], @t[2] + pxor @s[3], @t[1] + movdqa @x[7], @s[0] + pxor @s[2], @t[0] + movdqa @x[6], @s[1] + pxor @s[2], @t[1] + movdqa @x[5], @s[2] + pand @x[3], @s[0] + movdqa @x[4], @s[3] + pand @x[2], @s[1] + pand @x[1], @s[2] + por @x[0], @s[3] + pxor @s[0], @t[3] + pxor @s[1], @t[2] + pxor @s[2], @t[1] + pxor @s[3], @t[0] + + #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3 + + # new smaller inversion + + movdqa @t[3], @s[0] + pand @t[1], @t[3] + pxor @t[2], @s[0] + + movdqa @t[0], @s[2] + movdqa @s[0], @s[3] + pxor @t[3], @s[2] + pand @s[2], @s[3] + + movdqa @t[1], @s[1] + pxor @t[2], @s[3] + pxor @t[0], @s[1] + + pxor @t[2], @t[3] + + pand @t[3], @s[1] + + movdqa @s[2], @t[2] + pxor @t[0], @s[1] + + pxor @s[1], @t[2] + pxor @s[1], @t[1] + + pand @t[0], @t[2] + + pxor @t[2], @s[2] + pxor @t[2], @t[1] + + pand @s[3], @s[2] + + pxor @s[0], @s[2] +___ +# output in s3, s2, s1, t1 + +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3 + +# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3 + &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]); + +### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb +} + +# AES linear components + +sub ShiftRows { +my @x=@_[0..7]; +my $mask=pop; +$code.=<<___; + pxor 0x00($key),@x[0] + pxor 0x10($key),@x[1] + pshufb $mask,@x[0] + pxor 0x20($key),@x[2] + pshufb $mask,@x[1] + pxor 0x30($key),@x[3] + pshufb $mask,@x[2] + pxor 0x40($key),@x[4] + pshufb $mask,@x[3] + pxor 0x50($key),@x[5] + pshufb $mask,@x[4] + pxor 0x60($key),@x[6] + pshufb $mask,@x[5] + pxor 0x70($key),@x[7] + pshufb $mask,@x[6] + lea 0x80($key),$key + pshufb $mask,@x[7] +___ +} + +sub MixColumns { +# modified to emit output in order suitable for feeding back to aesenc[last] +my @x=@_[0..7]; +my @t=@_[8..15]; +$code.=<<___; + pshufd \$0x93, @x[0], @t[0] # x0 <<< 32 + pshufd \$0x93, @x[1], @t[1] + pxor @t[0], @x[0] # x0 ^ (x0 <<< 32) + pshufd \$0x93, @x[2], @t[2] + pxor @t[1], @x[1] + pshufd \$0x93, @x[3], @t[3] + pxor @t[2], @x[2] + pshufd \$0x93, @x[4], @t[4] + pxor @t[3], @x[3] + pshufd \$0x93, @x[5], @t[5] + pxor @t[4], @x[4] + pshufd \$0x93, @x[6], @t[6] + pxor @t[5], @x[5] + pshufd \$0x93, @x[7], @t[7] + pxor @t[6], @x[6] + pxor @t[7], @x[7] + + pxor @x[0], @t[1] + pxor @x[7], @t[0] + pxor @x[7], @t[1] + pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64) + pxor @x[1], @t[2] + pshufd \$0x4E, @x[1], @x[1] + pxor @x[4], @t[5] + pxor @t[0], @x[0] + pxor @x[5], @t[6] + pxor @t[1], @x[1] + pxor @x[3], @t[4] + pshufd \$0x4E, @x[4], @t[0] + pxor @x[6], @t[7] + pshufd \$0x4E, @x[5], @t[1] + pxor @x[2], @t[3] + pshufd \$0x4E, @x[3], @x[4] + pxor @x[7], @t[3] + pshufd \$0x4E, @x[7], @x[5] + pxor @x[7], @t[4] + pshufd \$0x4E, @x[6], @x[3] + pxor @t[4], @t[0] + pshufd \$0x4E, @x[2], @x[6] + pxor @t[5], @t[1] + + pxor @t[3], @x[4] + pxor @t[7], @x[5] + pxor @t[6], @x[3] + movdqa @t[0], @x[2] + pxor @t[2], @x[6] + movdqa @t[1], @x[7] +___ +} + +sub InvMixColumns { +my @x=@_[0..7]; +my @t=@_[8..15]; + +$code.=<<___; + # multiplication by 0x0e + pshufd \$0x93, @x[7], @t[7] + movdqa @x[2], @t[2] + pxor @x[5], @x[7] # 7 5 + pxor @x[5], @x[2] # 2 5 + pshufd \$0x93, @x[0], @t[0] + movdqa @x[5], @t[5] + pxor @x[0], @x[5] # 5 0 [1] + pxor @x[1], @x[0] # 0 1 + pshufd \$0x93, @x[1], @t[1] + pxor @x[2], @x[1] # 1 25 + pxor @x[6], @x[0] # 01 6 [2] + pxor @x[3], @x[1] # 125 3 [4] + pshufd \$0x93, @x[3], @t[3] + pxor @x[0], @x[2] # 25 016 [3] + pxor @x[7], @x[3] # 3 75 + pxor @x[6], @x[7] # 75 6 [0] + pshufd \$0x93, @x[6], @t[6] + movdqa @x[4], @t[4] + pxor @x[4], @x[6] # 6 4 + pxor @x[3], @x[4] # 4 375 [6] + pxor @x[7], @x[3] # 375 756=36 + pxor @t[5], @x[6] # 64 5 [7] + pxor @t[2], @x[3] # 36 2 + pxor @t[4], @x[3] # 362 4 [5] + pshufd \$0x93, @t[5], @t[5] +___ + my @y = @x[7,5,0,2,1,3,4,6]; +$code.=<<___; + # multiplication by 0x0b + pxor @y[0], @y[1] + pxor @t[0], @y[0] + pxor @t[1], @y[1] + pshufd \$0x93, @t[2], @t[2] + pxor @t[5], @y[0] + pxor @t[6], @y[1] + pxor @t[7], @y[0] + pshufd \$0x93, @t[4], @t[4] + pxor @t[6], @t[7] # clobber t[7] + pxor @y[0], @y[1] + + pxor @t[0], @y[3] + pshufd \$0x93, @t[0], @t[0] + pxor @t[1], @y[2] + pxor @t[1], @y[4] + pxor @t[2], @y[2] + pshufd \$0x93, @t[1], @t[1] + pxor @t[2], @y[3] + pxor @t[2], @y[5] + pxor @t[7], @y[2] + pshufd \$0x93, @t[2], @t[2] + pxor @t[3], @y[3] + pxor @t[3], @y[6] + pxor @t[3], @y[4] + pshufd \$0x93, @t[3], @t[3] + pxor @t[4], @y[7] + pxor @t[4], @y[5] + pxor @t[7], @y[7] + pxor @t[5], @y[3] + pxor @t[4], @y[4] + pxor @t[5], @t[7] # clobber t[7] even more + + pxor @t[7], @y[5] + pshufd \$0x93, @t[4], @t[4] + pxor @t[7], @y[6] + pxor @t[7], @y[4] + + pxor @t[5], @t[7] + pshufd \$0x93, @t[5], @t[5] + pxor @t[6], @t[7] # restore t[7] + + # multiplication by 0x0d + pxor @y[7], @y[4] + pxor @t[4], @y[7] + pshufd \$0x93, @t[6], @t[6] + pxor @t[0], @y[2] + pxor @t[5], @y[7] + pxor @t[2], @y[2] + pshufd \$0x93, @t[7], @t[7] + + pxor @y[1], @y[3] + pxor @t[1], @y[1] + pxor @t[0], @y[0] + pxor @t[0], @y[3] + pxor @t[5], @y[1] + pxor @t[5], @y[0] + pxor @t[7], @y[1] + pshufd \$0x93, @t[0], @t[0] + pxor @t[6], @y[0] + pxor @y[1], @y[3] + pxor @t[1], @y[4] + pshufd \$0x93, @t[1], @t[1] + + pxor @t[7], @y[7] + pxor @t[2], @y[4] + pxor @t[2], @y[5] + pshufd \$0x93, @t[2], @t[2] + pxor @t[6], @y[2] + pxor @t[3], @t[6] # clobber t[6] + pxor @y[7], @y[4] + pxor @t[6], @y[3] + + pxor @t[6], @y[6] + pxor @t[5], @y[5] + pxor @t[4], @y[6] + pshufd \$0x93, @t[4], @t[4] + pxor @t[6], @y[5] + pxor @t[7], @y[6] + pxor @t[3], @t[6] # restore t[6] + + pshufd \$0x93, @t[5], @t[5] + pshufd \$0x93, @t[6], @t[6] + pshufd \$0x93, @t[7], @t[7] + pshufd \$0x93, @t[3], @t[3] + + # multiplication by 0x09 + pxor @y[1], @y[4] + pxor @y[1], @t[1] # t[1]=y[1] + pxor @t[5], @t[0] # clobber t[0] + pxor @t[5], @t[1] + pxor @t[0], @y[3] + pxor @y[0], @t[0] # t[0]=y[0] + pxor @t[6], @t[1] + pxor @t[7], @t[6] # clobber t[6] + pxor @t[1], @y[4] + pxor @t[4], @y[7] + pxor @y[4], @t[4] # t[4]=y[4] + pxor @t[3], @y[6] + pxor @y[3], @t[3] # t[3]=y[3] + pxor @t[2], @y[5] + pxor @y[2], @t[2] # t[2]=y[2] + pxor @t[7], @t[3] + pxor @y[5], @t[5] # t[5]=y[5] + pxor @t[6], @t[2] + pxor @t[6], @t[5] + pxor @y[6], @t[6] # t[6]=y[6] + pxor @y[7], @t[7] # t[7]=y[7] + + movdqa @t[0],@XMM[0] + movdqa @t[1],@XMM[1] + movdqa @t[2],@XMM[2] + movdqa @t[3],@XMM[3] + movdqa @t[4],@XMM[4] + movdqa @t[5],@XMM[5] + movdqa @t[6],@XMM[6] + movdqa @t[7],@XMM[7] +___ +} + +sub aesenc { # not used +my @b=@_[0..7]; +my @t=@_[8..15]; +$code.=<<___; + movdqa 0x30($const),@t[0] # .LSR +___ + &ShiftRows (@b,@t[0]); + &Sbox (@b,@t); + &MixColumns (@b[0,1,4,6,3,7,2,5],@t); +} + +sub aesenclast { # not used +my @b=@_[0..7]; +my @t=@_[8..15]; +$code.=<<___; + movdqa 0x40($const),@t[0] # .LSRM0 +___ + &ShiftRows (@b,@t[0]); + &Sbox (@b,@t); +$code.=<<___ + pxor 0x00($key),@b[0] + pxor 0x10($key),@b[1] + pxor 0x20($key),@b[4] + pxor 0x30($key),@b[6] + pxor 0x40($key),@b[3] + pxor 0x50($key),@b[7] + pxor 0x60($key),@b[2] + pxor 0x70($key),@b[5] +___ +} + +sub swapmove { +my ($a,$b,$n,$mask,$t)=@_; +$code.=<<___; + movdqa $b,$t + psrlq \$$n,$b + pxor $a,$b + pand $mask,$b + pxor $b,$a + psllq \$$n,$b + pxor $t,$b +___ +} +sub swapmove2x { +my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_; +$code.=<<___; + movdqa $b0,$t0 + psrlq \$$n,$b0 + movdqa $b1,$t1 + psrlq \$$n,$b1 + pxor $a0,$b0 + pxor $a1,$b1 + pand $mask,$b0 + pand $mask,$b1 + pxor $b0,$a0 + psllq \$$n,$b0 + pxor $b1,$a1 + psllq \$$n,$b1 + pxor $t0,$b0 + pxor $t1,$b1 +___ +} + +sub bitslice { +my @x=reverse(@_[0..7]); +my ($t0,$t1,$t2,$t3)=@_[8..11]; +$code.=<<___; + movdqa 0x00($const),$t0 # .LBS0 + movdqa 0x10($const),$t1 # .LBS1 +___ + &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3); + &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); +$code.=<<___; + movdqa 0x20($const),$t0 # .LBS2 +___ + &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3); + &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); + + &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3); + &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3); +} + +$code.=<<___; +.text + +.extern asm_AES_encrypt +.extern asm_AES_decrypt + +.type _bsaes_encrypt8,\@abi-omnipotent +.align 64 +_bsaes_encrypt8: + lea .LBS0(%rip), $const # constants table + + movdqa ($key), @XMM[9] # round 0 key + lea 0x10($key), $key + movdqa 0x50($const), @XMM[8] # .LM0SR + pxor @XMM[9], @XMM[0] # xor with round0 key + pxor @XMM[9], @XMM[1] + pshufb @XMM[8], @XMM[0] + pxor @XMM[9], @XMM[2] + pshufb @XMM[8], @XMM[1] + pxor @XMM[9], @XMM[3] + pshufb @XMM[8], @XMM[2] + pxor @XMM[9], @XMM[4] + pshufb @XMM[8], @XMM[3] + pxor @XMM[9], @XMM[5] + pshufb @XMM[8], @XMM[4] + pxor @XMM[9], @XMM[6] + pshufb @XMM[8], @XMM[5] + pxor @XMM[9], @XMM[7] + pshufb @XMM[8], @XMM[6] + pshufb @XMM[8], @XMM[7] +_bsaes_encrypt8_bitslice: +___ + &bitslice (@XMM[0..7, 8..11]); +$code.=<<___; + dec $rounds + jmp .Lenc_sbox +.align 16 +.Lenc_loop: +___ + &ShiftRows (@XMM[0..7, 8]); +$code.=".Lenc_sbox:\n"; + &Sbox (@XMM[0..7, 8..15]); +$code.=<<___; + dec $rounds + jl .Lenc_done +___ + &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]); +$code.=<<___; + movdqa 0x30($const), @XMM[8] # .LSR + jnz .Lenc_loop + movdqa 0x40($const), @XMM[8] # .LSRM0 + jmp .Lenc_loop +.align 16 +.Lenc_done: +___ + # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb + &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]); +$code.=<<___; + movdqa ($key), @XMM[8] # last round key + pxor @XMM[8], @XMM[4] + pxor @XMM[8], @XMM[6] + pxor @XMM[8], @XMM[3] + pxor @XMM[8], @XMM[7] + pxor @XMM[8], @XMM[2] + pxor @XMM[8], @XMM[5] + pxor @XMM[8], @XMM[0] + pxor @XMM[8], @XMM[1] + ret +.size _bsaes_encrypt8,.-_bsaes_encrypt8 + +.type _bsaes_decrypt8,\@abi-omnipotent +.align 64 +_bsaes_decrypt8: + lea .LBS0(%rip), $const # constants table + + movdqa ($key), @XMM[9] # round 0 key + lea 0x10($key), $key + movdqa -0x30($const), @XMM[8] # .LM0ISR + pxor @XMM[9], @XMM[0] # xor with round0 key + pxor @XMM[9], @XMM[1] + pshufb @XMM[8], @XMM[0] + pxor @XMM[9], @XMM[2] + pshufb @XMM[8], @XMM[1] + pxor @XMM[9], @XMM[3] + pshufb @XMM[8], @XMM[2] + pxor @XMM[9], @XMM[4] + pshufb @XMM[8], @XMM[3] + pxor @XMM[9], @XMM[5] + pshufb @XMM[8], @XMM[4] + pxor @XMM[9], @XMM[6] + pshufb @XMM[8], @XMM[5] + pxor @XMM[9], @XMM[7] + pshufb @XMM[8], @XMM[6] + pshufb @XMM[8], @XMM[7] +___ + &bitslice (@XMM[0..7, 8..11]); +$code.=<<___; + dec $rounds + jmp .Ldec_sbox +.align 16 +.Ldec_loop: +___ + &ShiftRows (@XMM[0..7, 8]); +$code.=".Ldec_sbox:\n"; + &InvSbox (@XMM[0..7, 8..15]); +$code.=<<___; + dec $rounds + jl .Ldec_done +___ + &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]); +$code.=<<___; + movdqa -0x10($const), @XMM[8] # .LISR + jnz .Ldec_loop + movdqa -0x20($const), @XMM[8] # .LISRM0 + jmp .Ldec_loop +.align 16 +.Ldec_done: +___ + &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]); +$code.=<<___; + movdqa ($key), @XMM[8] # last round key + pxor @XMM[8], @XMM[6] + pxor @XMM[8], @XMM[4] + pxor @XMM[8], @XMM[2] + pxor @XMM[8], @XMM[7] + pxor @XMM[8], @XMM[3] + pxor @XMM[8], @XMM[5] + pxor @XMM[8], @XMM[0] + pxor @XMM[8], @XMM[1] + ret +.size _bsaes_decrypt8,.-_bsaes_decrypt8 +___ +} +{ +my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11"); + +sub bitslice_key { +my @x=reverse(@_[0..7]); +my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12]; + + &swapmove (@x[0,1],1,$bs0,$t2,$t3); +$code.=<<___; + #&swapmove(@x[2,3],1,$t0,$t2,$t3); + movdqa @x[0], @x[2] + movdqa @x[1], @x[3] +___ + #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3); + + &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3); +$code.=<<___; + #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3); + movdqa @x[0], @x[4] + movdqa @x[2], @x[6] + movdqa @x[1], @x[5] + movdqa @x[3], @x[7] +___ + &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3); + &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3); +} + +$code.=<<___; +.type _bsaes_key_convert,\@abi-omnipotent +.align 16 +_bsaes_key_convert: + lea .Lmasks(%rip), $const + movdqu ($inp), %xmm7 # load round 0 key + lea 0x10($inp), $inp + movdqa 0x00($const), %xmm0 # 0x01... + movdqa 0x10($const), %xmm1 # 0x02... + movdqa 0x20($const), %xmm2 # 0x04... + movdqa 0x30($const), %xmm3 # 0x08... + movdqa 0x40($const), %xmm4 # .LM0 + pcmpeqd %xmm5, %xmm5 # .LNOT + + movdqu ($inp), %xmm6 # load round 1 key + movdqa %xmm7, ($out) # save round 0 key + lea 0x10($out), $out + dec $rounds + jmp .Lkey_loop +.align 16 +.Lkey_loop: + pshufb %xmm4, %xmm6 # .LM0 + + movdqa %xmm0, %xmm8 + movdqa %xmm1, %xmm9 + + pand %xmm6, %xmm8 + pand %xmm6, %xmm9 + movdqa %xmm2, %xmm10 + pcmpeqb %xmm0, %xmm8 + psllq \$4, %xmm0 # 0x10... + movdqa %xmm3, %xmm11 + pcmpeqb %xmm1, %xmm9 + psllq \$4, %xmm1 # 0x20... + + pand %xmm6, %xmm10 + pand %xmm6, %xmm11 + movdqa %xmm0, %xmm12 + pcmpeqb %xmm2, %xmm10 + psllq \$4, %xmm2 # 0x40... + movdqa %xmm1, %xmm13 + pcmpeqb %xmm3, %xmm11 + psllq \$4, %xmm3 # 0x80... + + movdqa %xmm2, %xmm14 + movdqa %xmm3, %xmm15 + pxor %xmm5, %xmm8 # "pnot" + pxor %xmm5, %xmm9 + + pand %xmm6, %xmm12 + pand %xmm6, %xmm13 + movdqa %xmm8, 0x00($out) # write bit-sliced round key + pcmpeqb %xmm0, %xmm12 + psrlq \$4, %xmm0 # 0x01... + movdqa %xmm9, 0x10($out) + pcmpeqb %xmm1, %xmm13 + psrlq \$4, %xmm1 # 0x02... + lea 0x10($inp), $inp + + pand %xmm6, %xmm14 + pand %xmm6, %xmm15 + movdqa %xmm10, 0x20($out) + pcmpeqb %xmm2, %xmm14 + psrlq \$4, %xmm2 # 0x04... + movdqa %xmm11, 0x30($out) + pcmpeqb %xmm3, %xmm15 + psrlq \$4, %xmm3 # 0x08... + movdqu ($inp), %xmm6 # load next round key + + pxor %xmm5, %xmm13 # "pnot" + pxor %xmm5, %xmm14 + movdqa %xmm12, 0x40($out) + movdqa %xmm13, 0x50($out) + movdqa %xmm14, 0x60($out) + movdqa %xmm15, 0x70($out) + lea 0x80($out),$out + dec $rounds + jnz .Lkey_loop + + movdqa 0x50($const), %xmm7 # .L63 + #movdqa %xmm6, ($out) # don't save last round key + ret +.size _bsaes_key_convert,.-_bsaes_key_convert +___ +} + +if (0 && !$win64) { # following four functions are unsupported interface + # used for benchmarking... +$code.=<<___; +.globl bsaes_enc_key_convert +.type bsaes_enc_key_convert,\@function,2 +.align 16 +bsaes_enc_key_convert: + mov 240($inp),%r10d # pass rounds + mov $inp,%rcx # pass key + mov $out,%rax # pass key schedule + call _bsaes_key_convert + pxor %xmm6,%xmm7 # fix up last round key + movdqa %xmm7,(%rax) # save last round key + ret +.size bsaes_enc_key_convert,.-bsaes_enc_key_convert + +.globl bsaes_encrypt_128 +.type bsaes_encrypt_128,\@function,4 +.align 16 +bsaes_encrypt_128: +.Lenc128_loop: + movdqu 0x00($inp), @XMM[0] # load input + movdqu 0x10($inp), @XMM[1] + movdqu 0x20($inp), @XMM[2] + movdqu 0x30($inp), @XMM[3] + movdqu 0x40($inp), @XMM[4] + movdqu 0x50($inp), @XMM[5] + movdqu 0x60($inp), @XMM[6] + movdqu 0x70($inp), @XMM[7] + mov $key, %rax # pass the $key + lea 0x80($inp), $inp + mov \$10,%r10d + + call _bsaes_encrypt8 + + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + movdqu @XMM[6], 0x30($out) + movdqu @XMM[3], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[2], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + sub \$0x80,$len + ja .Lenc128_loop + ret +.size bsaes_encrypt_128,.-bsaes_encrypt_128 + +.globl bsaes_dec_key_convert +.type bsaes_dec_key_convert,\@function,2 +.align 16 +bsaes_dec_key_convert: + mov 240($inp),%r10d # pass rounds + mov $inp,%rcx # pass key + mov $out,%rax # pass key schedule + call _bsaes_key_convert + pxor ($out),%xmm7 # fix up round 0 key + movdqa %xmm6,(%rax) # save last round key + movdqa %xmm7,($out) + ret +.size bsaes_dec_key_convert,.-bsaes_dec_key_convert + +.globl bsaes_decrypt_128 +.type bsaes_decrypt_128,\@function,4 +.align 16 +bsaes_decrypt_128: +.Ldec128_loop: + movdqu 0x00($inp), @XMM[0] # load input + movdqu 0x10($inp), @XMM[1] + movdqu 0x20($inp), @XMM[2] + movdqu 0x30($inp), @XMM[3] + movdqu 0x40($inp), @XMM[4] + movdqu 0x50($inp), @XMM[5] + movdqu 0x60($inp), @XMM[6] + movdqu 0x70($inp), @XMM[7] + mov $key, %rax # pass the $key + lea 0x80($inp), $inp + mov \$10,%r10d + + call _bsaes_decrypt8 + + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[3], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + sub \$0x80,$len + ja .Ldec128_loop + ret +.size bsaes_decrypt_128,.-bsaes_decrypt_128 +___ +} +{ +###################################################################### +# +# OpenSSL interface +# +my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d") + : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); +my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15"); + +if ($ecb) { +$code.=<<___; +.globl bsaes_ecb_encrypt_blocks +.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent +.align 16 +bsaes_ecb_encrypt_blocks: + mov %rsp, %rax +.Lecb_enc_prologue: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + lea -0x48(%rsp),%rsp +___ +$code.=<<___ if ($win64); + lea -0xa0(%rsp), %rsp + movaps %xmm6, 0x40(%rsp) + movaps %xmm7, 0x50(%rsp) + movaps %xmm8, 0x60(%rsp) + movaps %xmm9, 0x70(%rsp) + movaps %xmm10, 0x80(%rsp) + movaps %xmm11, 0x90(%rsp) + movaps %xmm12, 0xa0(%rsp) + movaps %xmm13, 0xb0(%rsp) + movaps %xmm14, 0xc0(%rsp) + movaps %xmm15, 0xd0(%rsp) +.Lecb_enc_body: +___ +$code.=<<___; + mov %rsp,%rbp # backup %rsp + mov 240($arg4),%eax # rounds + mov $arg1,$inp # backup arguments + mov $arg2,$out + mov $arg3,$len + mov $arg4,$key + cmp \$8,$arg3 + jb .Lecb_enc_short + + mov %eax,%ebx # backup rounds + shl \$7,%rax # 128 bytes per inner round key + sub \$`128-32`,%rax # size of bit-sliced key schedule + sub %rax,%rsp + mov %rsp,%rax # pass key schedule + mov $key,%rcx # pass key + mov %ebx,%r10d # pass rounds + call _bsaes_key_convert + pxor %xmm6,%xmm7 # fix up last round key + movdqa %xmm7,(%rax) # save last round key + + sub \$8,$len +.Lecb_enc_loop: + movdqu 0x00($inp), @XMM[0] # load input + movdqu 0x10($inp), @XMM[1] + movdqu 0x20($inp), @XMM[2] + movdqu 0x30($inp), @XMM[3] + movdqu 0x40($inp), @XMM[4] + movdqu 0x50($inp), @XMM[5] + mov %rsp, %rax # pass key schedule + movdqu 0x60($inp), @XMM[6] + mov %ebx,%r10d # pass rounds + movdqu 0x70($inp), @XMM[7] + lea 0x80($inp), $inp + + call _bsaes_encrypt8 + + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + movdqu @XMM[6], 0x30($out) + movdqu @XMM[3], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[2], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + sub \$8,$len + jnc .Lecb_enc_loop + + add \$8,$len + jz .Lecb_enc_done + + movdqu 0x00($inp), @XMM[0] # load input + mov %rsp, %rax # pass key schedule + mov %ebx,%r10d # pass rounds + cmp \$2,$len + jb .Lecb_enc_one + movdqu 0x10($inp), @XMM[1] + je .Lecb_enc_two + movdqu 0x20($inp), @XMM[2] + cmp \$4,$len + jb .Lecb_enc_three + movdqu 0x30($inp), @XMM[3] + je .Lecb_enc_four + movdqu 0x40($inp), @XMM[4] + cmp \$6,$len + jb .Lecb_enc_five + movdqu 0x50($inp), @XMM[5] + je .Lecb_enc_six + movdqu 0x60($inp), @XMM[6] + call _bsaes_encrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + movdqu @XMM[6], 0x30($out) + movdqu @XMM[3], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[2], 0x60($out) + jmp .Lecb_enc_done +.align 16 +.Lecb_enc_six: + call _bsaes_encrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + movdqu @XMM[6], 0x30($out) + movdqu @XMM[3], 0x40($out) + movdqu @XMM[7], 0x50($out) + jmp .Lecb_enc_done +.align 16 +.Lecb_enc_five: + call _bsaes_encrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + movdqu @XMM[6], 0x30($out) + movdqu @XMM[3], 0x40($out) + jmp .Lecb_enc_done +.align 16 +.Lecb_enc_four: + call _bsaes_encrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + movdqu @XMM[6], 0x30($out) + jmp .Lecb_enc_done +.align 16 +.Lecb_enc_three: + call _bsaes_encrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + jmp .Lecb_enc_done +.align 16 +.Lecb_enc_two: + call _bsaes_encrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + jmp .Lecb_enc_done +.align 16 +.Lecb_enc_one: + call _bsaes_encrypt8 + movdqu @XMM[0], 0x00($out) # write output + jmp .Lecb_enc_done +.align 16 +.Lecb_enc_short: + lea ($inp), $arg1 + lea ($out), $arg2 + lea ($key), $arg3 + call asm_AES_encrypt + lea 16($inp), $inp + lea 16($out), $out + dec $len + jnz .Lecb_enc_short + +.Lecb_enc_done: + lea (%rsp),%rax + pxor %xmm0, %xmm0 +.Lecb_enc_bzero: # wipe key schedule [if any] + movdqa %xmm0, 0x00(%rax) + movdqa %xmm0, 0x10(%rax) + lea 0x20(%rax), %rax + cmp %rax, %rbp + jb .Lecb_enc_bzero + + lea (%rbp),%rsp # restore %rsp +___ +$code.=<<___ if ($win64); + movaps 0x40(%rbp), %xmm6 + movaps 0x50(%rbp), %xmm7 + movaps 0x60(%rbp), %xmm8 + movaps 0x70(%rbp), %xmm9 + movaps 0x80(%rbp), %xmm10 + movaps 0x90(%rbp), %xmm11 + movaps 0xa0(%rbp), %xmm12 + movaps 0xb0(%rbp), %xmm13 + movaps 0xc0(%rbp), %xmm14 + movaps 0xd0(%rbp), %xmm15 + lea 0xa0(%rbp), %rsp +___ +$code.=<<___; + mov 0x48(%rsp), %r15 + mov 0x50(%rsp), %r14 + mov 0x58(%rsp), %r13 + mov 0x60(%rsp), %r12 + mov 0x68(%rsp), %rbx + mov 0x70(%rsp), %rax + lea 0x78(%rsp), %rsp + mov %rax, %rbp +.Lecb_enc_epilogue: + ret +.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks + +.globl bsaes_ecb_decrypt_blocks +.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent +.align 16 +bsaes_ecb_decrypt_blocks: + mov %rsp, %rax +.Lecb_dec_prologue: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + lea -0x48(%rsp),%rsp +___ +$code.=<<___ if ($win64); + lea -0xa0(%rsp), %rsp + movaps %xmm6, 0x40(%rsp) + movaps %xmm7, 0x50(%rsp) + movaps %xmm8, 0x60(%rsp) + movaps %xmm9, 0x70(%rsp) + movaps %xmm10, 0x80(%rsp) + movaps %xmm11, 0x90(%rsp) + movaps %xmm12, 0xa0(%rsp) + movaps %xmm13, 0xb0(%rsp) + movaps %xmm14, 0xc0(%rsp) + movaps %xmm15, 0xd0(%rsp) +.Lecb_dec_body: +___ +$code.=<<___; + mov %rsp,%rbp # backup %rsp + mov 240($arg4),%eax # rounds + mov $arg1,$inp # backup arguments + mov $arg2,$out + mov $arg3,$len + mov $arg4,$key + cmp \$8,$arg3 + jb .Lecb_dec_short + + mov %eax,%ebx # backup rounds + shl \$7,%rax # 128 bytes per inner round key + sub \$`128-32`,%rax # size of bit-sliced key schedule + sub %rax,%rsp + mov %rsp,%rax # pass key schedule + mov $key,%rcx # pass key + mov %ebx,%r10d # pass rounds + call _bsaes_key_convert + pxor (%rsp),%xmm7 # fix up 0 round key + movdqa %xmm6,(%rax) # save last round key + movdqa %xmm7,(%rsp) + + sub \$8,$len +.Lecb_dec_loop: + movdqu 0x00($inp), @XMM[0] # load input + movdqu 0x10($inp), @XMM[1] + movdqu 0x20($inp), @XMM[2] + movdqu 0x30($inp), @XMM[3] + movdqu 0x40($inp), @XMM[4] + movdqu 0x50($inp), @XMM[5] + mov %rsp, %rax # pass key schedule + movdqu 0x60($inp), @XMM[6] + mov %ebx,%r10d # pass rounds + movdqu 0x70($inp), @XMM[7] + lea 0x80($inp), $inp + + call _bsaes_decrypt8 + + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[3], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + sub \$8,$len + jnc .Lecb_dec_loop + + add \$8,$len + jz .Lecb_dec_done + + movdqu 0x00($inp), @XMM[0] # load input + mov %rsp, %rax # pass key schedule + mov %ebx,%r10d # pass rounds + cmp \$2,$len + jb .Lecb_dec_one + movdqu 0x10($inp), @XMM[1] + je .Lecb_dec_two + movdqu 0x20($inp), @XMM[2] + cmp \$4,$len + jb .Lecb_dec_three + movdqu 0x30($inp), @XMM[3] + je .Lecb_dec_four + movdqu 0x40($inp), @XMM[4] + cmp \$6,$len + jb .Lecb_dec_five + movdqu 0x50($inp), @XMM[5] + je .Lecb_dec_six + movdqu 0x60($inp), @XMM[6] + call _bsaes_decrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[3], 0x60($out) + jmp .Lecb_dec_done +.align 16 +.Lecb_dec_six: + call _bsaes_decrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + jmp .Lecb_dec_done +.align 16 +.Lecb_dec_five: + call _bsaes_decrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + jmp .Lecb_dec_done +.align 16 +.Lecb_dec_four: + call _bsaes_decrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + jmp .Lecb_dec_done +.align 16 +.Lecb_dec_three: + call _bsaes_decrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + jmp .Lecb_dec_done +.align 16 +.Lecb_dec_two: + call _bsaes_decrypt8 + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + jmp .Lecb_dec_done +.align 16 +.Lecb_dec_one: + call _bsaes_decrypt8 + movdqu @XMM[0], 0x00($out) # write output + jmp .Lecb_dec_done +.align 16 +.Lecb_dec_short: + lea ($inp), $arg1 + lea ($out), $arg2 + lea ($key), $arg3 + call asm_AES_decrypt + lea 16($inp), $inp + lea 16($out), $out + dec $len + jnz .Lecb_dec_short + +.Lecb_dec_done: + lea (%rsp),%rax + pxor %xmm0, %xmm0 +.Lecb_dec_bzero: # wipe key schedule [if any] + movdqa %xmm0, 0x00(%rax) + movdqa %xmm0, 0x10(%rax) + lea 0x20(%rax), %rax + cmp %rax, %rbp + jb .Lecb_dec_bzero + + lea (%rbp),%rsp # restore %rsp +___ +$code.=<<___ if ($win64); + movaps 0x40(%rbp), %xmm6 + movaps 0x50(%rbp), %xmm7 + movaps 0x60(%rbp), %xmm8 + movaps 0x70(%rbp), %xmm9 + movaps 0x80(%rbp), %xmm10 + movaps 0x90(%rbp), %xmm11 + movaps 0xa0(%rbp), %xmm12 + movaps 0xb0(%rbp), %xmm13 + movaps 0xc0(%rbp), %xmm14 + movaps 0xd0(%rbp), %xmm15 + lea 0xa0(%rbp), %rsp +___ +$code.=<<___; + mov 0x48(%rsp), %r15 + mov 0x50(%rsp), %r14 + mov 0x58(%rsp), %r13 + mov 0x60(%rsp), %r12 + mov 0x68(%rsp), %rbx + mov 0x70(%rsp), %rax + lea 0x78(%rsp), %rsp + mov %rax, %rbp +.Lecb_dec_epilogue: + ret +.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks +___ +} +$code.=<<___; +.extern asm_AES_cbc_encrypt +.globl bsaes_cbc_encrypt +.type bsaes_cbc_encrypt,\@abi-omnipotent +.align 16 +bsaes_cbc_encrypt: +___ +$code.=<<___ if ($win64); + mov 48(%rsp),$arg6 # pull direction flag +___ +$code.=<<___; + cmp \$0,$arg6 + jne asm_AES_cbc_encrypt + cmp \$128,$arg3 + jb asm_AES_cbc_encrypt + + mov %rsp, %rax +.Lcbc_dec_prologue: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + lea -0x48(%rsp), %rsp +___ +$code.=<<___ if ($win64); + mov 0xa0(%rsp),$arg5 # pull ivp + lea -0xa0(%rsp), %rsp + movaps %xmm6, 0x40(%rsp) + movaps %xmm7, 0x50(%rsp) + movaps %xmm8, 0x60(%rsp) + movaps %xmm9, 0x70(%rsp) + movaps %xmm10, 0x80(%rsp) + movaps %xmm11, 0x90(%rsp) + movaps %xmm12, 0xa0(%rsp) + movaps %xmm13, 0xb0(%rsp) + movaps %xmm14, 0xc0(%rsp) + movaps %xmm15, 0xd0(%rsp) +.Lcbc_dec_body: +___ +$code.=<<___; + mov %rsp, %rbp # backup %rsp + mov 240($arg4), %eax # rounds + mov $arg1, $inp # backup arguments + mov $arg2, $out + mov $arg3, $len + mov $arg4, $key + mov $arg5, %rbx + shr \$4, $len # bytes to blocks + + mov %eax, %edx # rounds + shl \$7, %rax # 128 bytes per inner round key + sub \$`128-32`, %rax # size of bit-sliced key schedule + sub %rax, %rsp + + mov %rsp, %rax # pass key schedule + mov $key, %rcx # pass key + mov %edx, %r10d # pass rounds + call _bsaes_key_convert + pxor (%rsp),%xmm7 # fix up 0 round key + movdqa %xmm6,(%rax) # save last round key + movdqa %xmm7,(%rsp) + + movdqu (%rbx), @XMM[15] # load IV + sub \$8,$len +.Lcbc_dec_loop: + movdqu 0x00($inp), @XMM[0] # load input + movdqu 0x10($inp), @XMM[1] + movdqu 0x20($inp), @XMM[2] + movdqu 0x30($inp), @XMM[3] + movdqu 0x40($inp), @XMM[4] + movdqu 0x50($inp), @XMM[5] + mov %rsp, %rax # pass key schedule + movdqu 0x60($inp), @XMM[6] + mov %edx,%r10d # pass rounds + movdqu 0x70($inp), @XMM[7] + movdqa @XMM[15], 0x20(%rbp) # put aside IV + + call _bsaes_decrypt8 + + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[8] # re-load input + movdqu 0x10($inp), @XMM[9] + pxor @XMM[8], @XMM[1] + movdqu 0x20($inp), @XMM[10] + pxor @XMM[9], @XMM[6] + movdqu 0x30($inp), @XMM[11] + pxor @XMM[10], @XMM[4] + movdqu 0x40($inp), @XMM[12] + pxor @XMM[11], @XMM[2] + movdqu 0x50($inp), @XMM[13] + pxor @XMM[12], @XMM[7] + movdqu 0x60($inp), @XMM[14] + pxor @XMM[13], @XMM[3] + movdqu 0x70($inp), @XMM[15] # IV + pxor @XMM[14], @XMM[5] + movdqu @XMM[0], 0x00($out) # write output + lea 0x80($inp), $inp + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[3], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + sub \$8,$len + jnc .Lcbc_dec_loop + + add \$8,$len + jz .Lcbc_dec_done + + movdqu 0x00($inp), @XMM[0] # load input + mov %rsp, %rax # pass key schedule + mov %edx, %r10d # pass rounds + cmp \$2,$len + jb .Lcbc_dec_one + movdqu 0x10($inp), @XMM[1] + je .Lcbc_dec_two + movdqu 0x20($inp), @XMM[2] + cmp \$4,$len + jb .Lcbc_dec_three + movdqu 0x30($inp), @XMM[3] + je .Lcbc_dec_four + movdqu 0x40($inp), @XMM[4] + cmp \$6,$len + jb .Lcbc_dec_five + movdqu 0x50($inp), @XMM[5] + je .Lcbc_dec_six + movdqu 0x60($inp), @XMM[6] + movdqa @XMM[15], 0x20(%rbp) # put aside IV + call _bsaes_decrypt8 + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[8] # re-load input + movdqu 0x10($inp), @XMM[9] + pxor @XMM[8], @XMM[1] + movdqu 0x20($inp), @XMM[10] + pxor @XMM[9], @XMM[6] + movdqu 0x30($inp), @XMM[11] + pxor @XMM[10], @XMM[4] + movdqu 0x40($inp), @XMM[12] + pxor @XMM[11], @XMM[2] + movdqu 0x50($inp), @XMM[13] + pxor @XMM[12], @XMM[7] + movdqu 0x60($inp), @XMM[15] # IV + pxor @XMM[13], @XMM[3] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + movdqu @XMM[3], 0x60($out) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_six: + movdqa @XMM[15], 0x20(%rbp) # put aside IV + call _bsaes_decrypt8 + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[8] # re-load input + movdqu 0x10($inp), @XMM[9] + pxor @XMM[8], @XMM[1] + movdqu 0x20($inp), @XMM[10] + pxor @XMM[9], @XMM[6] + movdqu 0x30($inp), @XMM[11] + pxor @XMM[10], @XMM[4] + movdqu 0x40($inp), @XMM[12] + pxor @XMM[11], @XMM[2] + movdqu 0x50($inp), @XMM[15] # IV + pxor @XMM[12], @XMM[7] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_five: + movdqa @XMM[15], 0x20(%rbp) # put aside IV + call _bsaes_decrypt8 + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[8] # re-load input + movdqu 0x10($inp), @XMM[9] + pxor @XMM[8], @XMM[1] + movdqu 0x20($inp), @XMM[10] + pxor @XMM[9], @XMM[6] + movdqu 0x30($inp), @XMM[11] + pxor @XMM[10], @XMM[4] + movdqu 0x40($inp), @XMM[15] # IV + pxor @XMM[11], @XMM[2] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_four: + movdqa @XMM[15], 0x20(%rbp) # put aside IV + call _bsaes_decrypt8 + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[8] # re-load input + movdqu 0x10($inp), @XMM[9] + pxor @XMM[8], @XMM[1] + movdqu 0x20($inp), @XMM[10] + pxor @XMM[9], @XMM[6] + movdqu 0x30($inp), @XMM[15] # IV + pxor @XMM[10], @XMM[4] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_three: + movdqa @XMM[15], 0x20(%rbp) # put aside IV + call _bsaes_decrypt8 + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[8] # re-load input + movdqu 0x10($inp), @XMM[9] + pxor @XMM[8], @XMM[1] + movdqu 0x20($inp), @XMM[15] # IV + pxor @XMM[9], @XMM[6] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_two: + movdqa @XMM[15], 0x20(%rbp) # put aside IV + call _bsaes_decrypt8 + pxor 0x20(%rbp), @XMM[0] # ^= IV + movdqu 0x00($inp), @XMM[8] # re-load input + movdqu 0x10($inp), @XMM[15] # IV + pxor @XMM[8], @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_one: + lea ($inp), $arg1 + lea 0x20(%rbp), $arg2 # buffer output + lea ($key), $arg3 + call asm_AES_decrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[15] # ^= IV + movdqu @XMM[15], ($out) # write output + movdqa @XMM[0], @XMM[15] # IV + +.Lcbc_dec_done: + movdqu @XMM[15], (%rbx) # return IV + lea (%rsp), %rax + pxor %xmm0, %xmm0 +.Lcbc_dec_bzero: # wipe key schedule [if any] + movdqa %xmm0, 0x00(%rax) + movdqa %xmm0, 0x10(%rax) + lea 0x20(%rax), %rax + cmp %rax, %rbp + ja .Lcbc_dec_bzero + + lea (%rbp),%rsp # restore %rsp +___ +$code.=<<___ if ($win64); + movaps 0x40(%rbp), %xmm6 + movaps 0x50(%rbp), %xmm7 + movaps 0x60(%rbp), %xmm8 + movaps 0x70(%rbp), %xmm9 + movaps 0x80(%rbp), %xmm10 + movaps 0x90(%rbp), %xmm11 + movaps 0xa0(%rbp), %xmm12 + movaps 0xb0(%rbp), %xmm13 + movaps 0xc0(%rbp), %xmm14 + movaps 0xd0(%rbp), %xmm15 + lea 0xa0(%rbp), %rsp +___ +$code.=<<___; + mov 0x48(%rsp), %r15 + mov 0x50(%rsp), %r14 + mov 0x58(%rsp), %r13 + mov 0x60(%rsp), %r12 + mov 0x68(%rsp), %rbx + mov 0x70(%rsp), %rax + lea 0x78(%rsp), %rsp + mov %rax, %rbp +.Lcbc_dec_epilogue: + ret +.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt + +.globl bsaes_ctr32_encrypt_blocks +.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent +.align 16 +bsaes_ctr32_encrypt_blocks: + mov %rsp, %rax +.Lctr_enc_prologue: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + lea -0x48(%rsp), %rsp +___ +$code.=<<___ if ($win64); + mov 0xa0(%rsp),$arg5 # pull ivp + lea -0xa0(%rsp), %rsp + movaps %xmm6, 0x40(%rsp) + movaps %xmm7, 0x50(%rsp) + movaps %xmm8, 0x60(%rsp) + movaps %xmm9, 0x70(%rsp) + movaps %xmm10, 0x80(%rsp) + movaps %xmm11, 0x90(%rsp) + movaps %xmm12, 0xa0(%rsp) + movaps %xmm13, 0xb0(%rsp) + movaps %xmm14, 0xc0(%rsp) + movaps %xmm15, 0xd0(%rsp) +.Lctr_enc_body: +___ +$code.=<<___; + mov %rsp, %rbp # backup %rsp + movdqu ($arg5), %xmm0 # load counter + mov 240($arg4), %eax # rounds + mov $arg1, $inp # backup arguments + mov $arg2, $out + mov $arg3, $len + mov $arg4, $key + movdqa %xmm0, 0x20(%rbp) # copy counter + cmp \$8, $arg3 + jb .Lctr_enc_short + + mov %eax, %ebx # rounds + shl \$7, %rax # 128 bytes per inner round key + sub \$`128-32`, %rax # size of bit-sliced key schedule + sub %rax, %rsp + + mov %rsp, %rax # pass key schedule + mov $key, %rcx # pass key + mov %ebx, %r10d # pass rounds + call _bsaes_key_convert + pxor %xmm6,%xmm7 # fix up last round key + movdqa %xmm7,(%rax) # save last round key + + movdqa (%rsp), @XMM[9] # load round0 key + lea .LADD1(%rip), %r11 + movdqa 0x20(%rbp), @XMM[0] # counter copy + movdqa -0x20(%r11), @XMM[8] # .LSWPUP + pshufb @XMM[8], @XMM[9] # byte swap upper part + pshufb @XMM[8], @XMM[0] + movdqa @XMM[9], (%rsp) # save adjusted round0 key + jmp .Lctr_enc_loop +.align 16 +.Lctr_enc_loop: + movdqa @XMM[0], 0x20(%rbp) # save counter + movdqa @XMM[0], @XMM[1] # prepare 8 counter values + movdqa @XMM[0], @XMM[2] + paddd 0x00(%r11), @XMM[1] # .LADD1 + movdqa @XMM[0], @XMM[3] + paddd 0x10(%r11), @XMM[2] # .LADD2 + movdqa @XMM[0], @XMM[4] + paddd 0x20(%r11), @XMM[3] # .LADD3 + movdqa @XMM[0], @XMM[5] + paddd 0x30(%r11), @XMM[4] # .LADD4 + movdqa @XMM[0], @XMM[6] + paddd 0x40(%r11), @XMM[5] # .LADD5 + movdqa @XMM[0], @XMM[7] + paddd 0x50(%r11), @XMM[6] # .LADD6 + paddd 0x60(%r11), @XMM[7] # .LADD7 + + # Borrow prologue from _bsaes_encrypt8 to use the opportunity + # to flip byte order in 32-bit counter + movdqa (%rsp), @XMM[9] # round 0 key + lea 0x10(%rsp), %rax # pass key schedule + movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR + pxor @XMM[9], @XMM[0] # xor with round0 key + pxor @XMM[9], @XMM[1] + pshufb @XMM[8], @XMM[0] + pxor @XMM[9], @XMM[2] + pshufb @XMM[8], @XMM[1] + pxor @XMM[9], @XMM[3] + pshufb @XMM[8], @XMM[2] + pxor @XMM[9], @XMM[4] + pshufb @XMM[8], @XMM[3] + pxor @XMM[9], @XMM[5] + pshufb @XMM[8], @XMM[4] + pxor @XMM[9], @XMM[6] + pshufb @XMM[8], @XMM[5] + pxor @XMM[9], @XMM[7] + pshufb @XMM[8], @XMM[6] + lea .LBS0(%rip), %r11 # constants table + pshufb @XMM[8], @XMM[7] + mov %ebx,%r10d # pass rounds + + call _bsaes_encrypt8_bitslice + + sub \$8,$len + jc .Lctr_enc_loop_done + + movdqu 0x00($inp), @XMM[8] # load input + movdqu 0x10($inp), @XMM[9] + movdqu 0x20($inp), @XMM[10] + movdqu 0x30($inp), @XMM[11] + movdqu 0x40($inp), @XMM[12] + movdqu 0x50($inp), @XMM[13] + movdqu 0x60($inp), @XMM[14] + movdqu 0x70($inp), @XMM[15] + lea 0x80($inp),$inp + pxor @XMM[0], @XMM[8] + movdqa 0x20(%rbp), @XMM[0] # load counter + pxor @XMM[9], @XMM[1] + movdqu @XMM[8], 0x00($out) # write output + pxor @XMM[10], @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor @XMM[11], @XMM[6] + movdqu @XMM[4], 0x20($out) + pxor @XMM[12], @XMM[3] + movdqu @XMM[6], 0x30($out) + pxor @XMM[13], @XMM[7] + movdqu @XMM[3], 0x40($out) + pxor @XMM[14], @XMM[2] + movdqu @XMM[7], 0x50($out) + pxor @XMM[15], @XMM[5] + movdqu @XMM[2], 0x60($out) + lea .LADD1(%rip), %r11 + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + paddd 0x70(%r11), @XMM[0] # .LADD8 + jnz .Lctr_enc_loop + + jmp .Lctr_enc_done +.align 16 +.Lctr_enc_loop_done: + add \$8, $len + movdqu 0x00($inp), @XMM[8] # load input + pxor @XMM[8], @XMM[0] + movdqu @XMM[0], 0x00($out) # write output + cmp \$2,$len + jb .Lctr_enc_done + movdqu 0x10($inp), @XMM[9] + pxor @XMM[9], @XMM[1] + movdqu @XMM[1], 0x10($out) + je .Lctr_enc_done + movdqu 0x20($inp), @XMM[10] + pxor @XMM[10], @XMM[4] + movdqu @XMM[4], 0x20($out) + cmp \$4,$len + jb .Lctr_enc_done + movdqu 0x30($inp), @XMM[11] + pxor @XMM[11], @XMM[6] + movdqu @XMM[6], 0x30($out) + je .Lctr_enc_done + movdqu 0x40($inp), @XMM[12] + pxor @XMM[12], @XMM[3] + movdqu @XMM[3], 0x40($out) + cmp \$6,$len + jb .Lctr_enc_done + movdqu 0x50($inp), @XMM[13] + pxor @XMM[13], @XMM[7] + movdqu @XMM[7], 0x50($out) + je .Lctr_enc_done + movdqu 0x60($inp), @XMM[14] + pxor @XMM[14], @XMM[2] + movdqu @XMM[2], 0x60($out) + jmp .Lctr_enc_done + +.align 16 +.Lctr_enc_short: + lea 0x20(%rbp), $arg1 + lea 0x30(%rbp), $arg2 + lea ($key), $arg3 + call asm_AES_encrypt + movdqu ($inp), @XMM[1] + lea 16($inp), $inp + mov 0x2c(%rbp), %eax # load 32-bit counter + bswap %eax + pxor 0x30(%rbp), @XMM[1] + inc %eax # increment + movdqu @XMM[1], ($out) + bswap %eax + lea 16($out), $out + mov %eax, 0x2c(%rsp) # save 32-bit counter + dec $len + jnz .Lctr_enc_short + +.Lctr_enc_done: + lea (%rsp), %rax + pxor %xmm0, %xmm0 +.Lctr_enc_bzero: # wipe key schedule [if any] + movdqa %xmm0, 0x00(%rax) + movdqa %xmm0, 0x10(%rax) + lea 0x20(%rax), %rax + cmp %rax, %rbp + ja .Lctr_enc_bzero + + lea (%rbp),%rsp # restore %rsp +___ +$code.=<<___ if ($win64); + movaps 0x40(%rbp), %xmm6 + movaps 0x50(%rbp), %xmm7 + movaps 0x60(%rbp), %xmm8 + movaps 0x70(%rbp), %xmm9 + movaps 0x80(%rbp), %xmm10 + movaps 0x90(%rbp), %xmm11 + movaps 0xa0(%rbp), %xmm12 + movaps 0xb0(%rbp), %xmm13 + movaps 0xc0(%rbp), %xmm14 + movaps 0xd0(%rbp), %xmm15 + lea 0xa0(%rbp), %rsp +___ +$code.=<<___; + mov 0x48(%rsp), %r15 + mov 0x50(%rsp), %r14 + mov 0x58(%rsp), %r13 + mov 0x60(%rsp), %r12 + mov 0x68(%rsp), %rbx + mov 0x70(%rsp), %rax + lea 0x78(%rsp), %rsp + mov %rax, %rbp +.Lctr_enc_epilogue: + ret +.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks +___ +###################################################################### +# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len, +# const AES_KEY *key1, const AES_KEY *key2, +# const unsigned char iv[16]); +# +my ($twmask,$twres,$twtmp)=@XMM[13..15]; +$code.=<<___; +.globl bsaes_xts_encrypt +.type bsaes_xts_encrypt,\@abi-omnipotent +.align 16 +bsaes_xts_encrypt: + mov %rsp, %rax +.Lxts_enc_prologue: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + lea -0x48(%rsp), %rsp +___ +$code.=<<___ if ($win64); + mov 0xa0(%rsp),$arg5 # pull key2 + mov 0xa8(%rsp),$arg6 # pull ivp + lea -0xa0(%rsp), %rsp + movaps %xmm6, 0x40(%rsp) + movaps %xmm7, 0x50(%rsp) + movaps %xmm8, 0x60(%rsp) + movaps %xmm9, 0x70(%rsp) + movaps %xmm10, 0x80(%rsp) + movaps %xmm11, 0x90(%rsp) + movaps %xmm12, 0xa0(%rsp) + movaps %xmm13, 0xb0(%rsp) + movaps %xmm14, 0xc0(%rsp) + movaps %xmm15, 0xd0(%rsp) +.Lxts_enc_body: +___ +$code.=<<___; + mov %rsp, %rbp # backup %rsp + mov $arg1, $inp # backup arguments + mov $arg2, $out + mov $arg3, $len + mov $arg4, $key + + lea ($arg6), $arg1 + lea 0x20(%rbp), $arg2 + lea ($arg5), $arg3 + call asm_AES_encrypt # generate initial tweak + + mov 240($key), %eax # rounds + mov $len, %rbx # backup $len + + mov %eax, %edx # rounds + shl \$7, %rax # 128 bytes per inner round key + sub \$`128-32`, %rax # size of bit-sliced key schedule + sub %rax, %rsp + + mov %rsp, %rax # pass key schedule + mov $key, %rcx # pass key + mov %edx, %r10d # pass rounds + call _bsaes_key_convert + pxor %xmm6, %xmm7 # fix up last round key + movdqa %xmm7, (%rax) # save last round key + + and \$-16, $len + sub \$0x80, %rsp # place for tweak[8] + movdqa 0x20(%rbp), @XMM[7] # initial tweak + + pxor $twtmp, $twtmp + movdqa .Lxts_magic(%rip), $twmask + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + + sub \$0x80, $len + jc .Lxts_enc_short + jmp .Lxts_enc_loop + +.align 16 +.Lxts_enc_loop: +___ + for ($i=0;$i<7;$i++) { + $code.=<<___; + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + movdqa @XMM[7], @XMM[$i] + movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] +___ + $code.=<<___ if ($i>=1); + movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] +___ + $code.=<<___ if ($i>=2); + pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] +___ + } +$code.=<<___; + movdqu 0x60($inp), @XMM[8+6] + pxor @XMM[8+5], @XMM[5] + movdqu 0x70($inp), @XMM[8+7] + lea 0x80($inp), $inp + movdqa @XMM[7], 0x70(%rsp) + pxor @XMM[8+6], @XMM[6] + lea 0x80(%rsp), %rax # pass key schedule + pxor @XMM[8+7], @XMM[7] + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[6] + movdqu @XMM[4], 0x20($out) + pxor 0x40(%rsp), @XMM[3] + movdqu @XMM[6], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[3], 0x40($out) + pxor 0x60(%rsp), @XMM[2] + movdqu @XMM[7], 0x50($out) + pxor 0x70(%rsp), @XMM[5] + movdqu @XMM[2], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + + movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak + pxor $twtmp, $twtmp + movdqa .Lxts_magic(%rip), $twmask + pcmpgtd @XMM[7], $twtmp + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] + + sub \$0x80,$len + jnc .Lxts_enc_loop + +.Lxts_enc_short: + add \$0x80, $len + jz .Lxts_enc_done +___ + for ($i=0;$i<7;$i++) { + $code.=<<___; + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + movdqa @XMM[7], @XMM[$i] + movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] +___ + $code.=<<___ if ($i>=1); + movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] + cmp \$`0x10*$i`,$len + je .Lxts_enc_$i +___ + $code.=<<___ if ($i>=2); + pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] +___ + } +$code.=<<___; + movdqu 0x60($inp), @XMM[8+6] + pxor @XMM[8+5], @XMM[5] + movdqa @XMM[7], 0x70(%rsp) + lea 0x70($inp), $inp + pxor @XMM[8+6], @XMM[6] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[6] + movdqu @XMM[4], 0x20($out) + pxor 0x40(%rsp), @XMM[3] + movdqu @XMM[6], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[3], 0x40($out) + pxor 0x60(%rsp), @XMM[2] + movdqu @XMM[7], 0x50($out) + movdqu @XMM[2], 0x60($out) + lea 0x70($out), $out + + movdqa 0x70(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_6: + pxor @XMM[8+4], @XMM[4] + lea 0x60($inp), $inp + pxor @XMM[8+5], @XMM[5] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[6] + movdqu @XMM[4], 0x20($out) + pxor 0x40(%rsp), @XMM[3] + movdqu @XMM[6], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[3], 0x40($out) + movdqu @XMM[7], 0x50($out) + lea 0x60($out), $out + + movdqa 0x60(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_5: + pxor @XMM[8+3], @XMM[3] + lea 0x50($inp), $inp + pxor @XMM[8+4], @XMM[4] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[6] + movdqu @XMM[4], 0x20($out) + pxor 0x40(%rsp), @XMM[3] + movdqu @XMM[6], 0x30($out) + movdqu @XMM[3], 0x40($out) + lea 0x50($out), $out + + movdqa 0x50(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_4: + pxor @XMM[8+2], @XMM[2] + lea 0x40($inp), $inp + pxor @XMM[8+3], @XMM[3] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[6] + movdqu @XMM[4], 0x20($out) + movdqu @XMM[6], 0x30($out) + lea 0x40($out), $out + + movdqa 0x40(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_3: + pxor @XMM[8+1], @XMM[1] + lea 0x30($inp), $inp + pxor @XMM[8+2], @XMM[2] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[4] + movdqu @XMM[1], 0x10($out) + movdqu @XMM[4], 0x20($out) + lea 0x30($out), $out + + movdqa 0x30(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_2: + pxor @XMM[8+0], @XMM[0] + lea 0x20($inp), $inp + pxor @XMM[8+1], @XMM[1] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_encrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + lea 0x20($out), $out + + movdqa 0x20(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_enc_done +.align 16 +.Lxts_enc_1: + pxor @XMM[0], @XMM[8] + lea 0x10($inp), $inp + movdqa @XMM[8], 0x20(%rbp) + lea 0x20(%rbp), $arg1 + lea 0x20(%rbp), $arg2 + lea ($key), $arg3 + call asm_AES_encrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[0] # ^= tweak[] + #pxor @XMM[8], @XMM[0] + #lea 0x80(%rsp), %rax # pass key schedule + #mov %edx, %r10d # pass rounds + #call _bsaes_encrypt8 + #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + movdqu @XMM[0], 0x00($out) # write output + lea 0x10($out), $out + + movdqa 0x10(%rsp), @XMM[7] # next iteration tweak + +.Lxts_enc_done: + and \$15, %ebx + jz .Lxts_enc_ret + mov $out, %rdx + +.Lxts_enc_steal: + movzb ($inp), %eax + movzb -16(%rdx), %ecx + lea 1($inp), $inp + mov %al, -16(%rdx) + mov %cl, 0(%rdx) + lea 1(%rdx), %rdx + sub \$1,%ebx + jnz .Lxts_enc_steal + + movdqu -16($out), @XMM[0] + lea 0x20(%rbp), $arg1 + pxor @XMM[7], @XMM[0] + lea 0x20(%rbp), $arg2 + movdqa @XMM[0], 0x20(%rbp) + lea ($key), $arg3 + call asm_AES_encrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[7] + movdqu @XMM[7], -16($out) + +.Lxts_enc_ret: + lea (%rsp), %rax + pxor %xmm0, %xmm0 +.Lxts_enc_bzero: # wipe key schedule [if any] + movdqa %xmm0, 0x00(%rax) + movdqa %xmm0, 0x10(%rax) + lea 0x20(%rax), %rax + cmp %rax, %rbp + ja .Lxts_enc_bzero + + lea (%rbp),%rsp # restore %rsp +___ +$code.=<<___ if ($win64); + movaps 0x40(%rbp), %xmm6 + movaps 0x50(%rbp), %xmm7 + movaps 0x60(%rbp), %xmm8 + movaps 0x70(%rbp), %xmm9 + movaps 0x80(%rbp), %xmm10 + movaps 0x90(%rbp), %xmm11 + movaps 0xa0(%rbp), %xmm12 + movaps 0xb0(%rbp), %xmm13 + movaps 0xc0(%rbp), %xmm14 + movaps 0xd0(%rbp), %xmm15 + lea 0xa0(%rbp), %rsp +___ +$code.=<<___; + mov 0x48(%rsp), %r15 + mov 0x50(%rsp), %r14 + mov 0x58(%rsp), %r13 + mov 0x60(%rsp), %r12 + mov 0x68(%rsp), %rbx + mov 0x70(%rsp), %rax + lea 0x78(%rsp), %rsp + mov %rax, %rbp +.Lxts_enc_epilogue: + ret +.size bsaes_xts_encrypt,.-bsaes_xts_encrypt + +.globl bsaes_xts_decrypt +.type bsaes_xts_decrypt,\@abi-omnipotent +.align 16 +bsaes_xts_decrypt: + mov %rsp, %rax +.Lxts_dec_prologue: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + lea -0x48(%rsp), %rsp +___ +$code.=<<___ if ($win64); + mov 0xa0(%rsp),$arg5 # pull key2 + mov 0xa8(%rsp),$arg6 # pull ivp + lea -0xa0(%rsp), %rsp + movaps %xmm6, 0x40(%rsp) + movaps %xmm7, 0x50(%rsp) + movaps %xmm8, 0x60(%rsp) + movaps %xmm9, 0x70(%rsp) + movaps %xmm10, 0x80(%rsp) + movaps %xmm11, 0x90(%rsp) + movaps %xmm12, 0xa0(%rsp) + movaps %xmm13, 0xb0(%rsp) + movaps %xmm14, 0xc0(%rsp) + movaps %xmm15, 0xd0(%rsp) +.Lxts_dec_body: +___ +$code.=<<___; + mov %rsp, %rbp # backup %rsp + mov $arg1, $inp # backup arguments + mov $arg2, $out + mov $arg3, $len + mov $arg4, $key + + lea ($arg6), $arg1 + lea 0x20(%rbp), $arg2 + lea ($arg5), $arg3 + call asm_AES_encrypt # generate initial tweak + + mov 240($key), %eax # rounds + mov $len, %rbx # backup $len + + mov %eax, %edx # rounds + shl \$7, %rax # 128 bytes per inner round key + sub \$`128-32`, %rax # size of bit-sliced key schedule + sub %rax, %rsp + + mov %rsp, %rax # pass key schedule + mov $key, %rcx # pass key + mov %edx, %r10d # pass rounds + call _bsaes_key_convert + pxor (%rsp), %xmm7 # fix up round 0 key + movdqa %xmm6, (%rax) # save last round key + movdqa %xmm7, (%rsp) + + xor %eax, %eax # if ($len%16) len-=16; + and \$-16, $len + test \$15, %ebx + setnz %al + shl \$4, %rax + sub %rax, $len + + sub \$0x80, %rsp # place for tweak[8] + movdqa 0x20(%rbp), @XMM[7] # initial tweak + + pxor $twtmp, $twtmp + movdqa .Lxts_magic(%rip), $twmask + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + + sub \$0x80, $len + jc .Lxts_dec_short + jmp .Lxts_dec_loop + +.align 16 +.Lxts_dec_loop: +___ + for ($i=0;$i<7;$i++) { + $code.=<<___; + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + movdqa @XMM[7], @XMM[$i] + movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] +___ + $code.=<<___ if ($i>=1); + movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] +___ + $code.=<<___ if ($i>=2); + pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] +___ + } +$code.=<<___; + movdqu 0x60($inp), @XMM[8+6] + pxor @XMM[8+5], @XMM[5] + movdqu 0x70($inp), @XMM[8+7] + lea 0x80($inp), $inp + movdqa @XMM[7], 0x70(%rsp) + pxor @XMM[8+6], @XMM[6] + lea 0x80(%rsp), %rax # pass key schedule + pxor @XMM[8+7], @XMM[7] + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[4] + movdqu @XMM[6], 0x20($out) + pxor 0x40(%rsp), @XMM[2] + movdqu @XMM[4], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[2], 0x40($out) + pxor 0x60(%rsp), @XMM[3] + movdqu @XMM[7], 0x50($out) + pxor 0x70(%rsp), @XMM[5] + movdqu @XMM[3], 0x60($out) + movdqu @XMM[5], 0x70($out) + lea 0x80($out), $out + + movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak + pxor $twtmp, $twtmp + movdqa .Lxts_magic(%rip), $twmask + pcmpgtd @XMM[7], $twtmp + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] + + sub \$0x80,$len + jnc .Lxts_dec_loop + +.Lxts_dec_short: + add \$0x80, $len + jz .Lxts_dec_done +___ + for ($i=0;$i<7;$i++) { + $code.=<<___; + pshufd \$0x13, $twtmp, $twres + pxor $twtmp, $twtmp + movdqa @XMM[7], @XMM[$i] + movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i] + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + pcmpgtd @XMM[7], $twtmp # broadcast upper bits + pxor $twres, @XMM[7] +___ + $code.=<<___ if ($i>=1); + movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1] + cmp \$`0x10*$i`,$len + je .Lxts_dec_$i +___ + $code.=<<___ if ($i>=2); + pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[] +___ + } +$code.=<<___; + movdqu 0x60($inp), @XMM[8+6] + pxor @XMM[8+5], @XMM[5] + movdqa @XMM[7], 0x70(%rsp) + lea 0x70($inp), $inp + pxor @XMM[8+6], @XMM[6] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[4] + movdqu @XMM[6], 0x20($out) + pxor 0x40(%rsp), @XMM[2] + movdqu @XMM[4], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[2], 0x40($out) + pxor 0x60(%rsp), @XMM[3] + movdqu @XMM[7], 0x50($out) + movdqu @XMM[3], 0x60($out) + lea 0x70($out), $out + + movdqa 0x70(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_6: + pxor @XMM[8+4], @XMM[4] + lea 0x60($inp), $inp + pxor @XMM[8+5], @XMM[5] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[4] + movdqu @XMM[6], 0x20($out) + pxor 0x40(%rsp), @XMM[2] + movdqu @XMM[4], 0x30($out) + pxor 0x50(%rsp), @XMM[7] + movdqu @XMM[2], 0x40($out) + movdqu @XMM[7], 0x50($out) + lea 0x60($out), $out + + movdqa 0x60(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_5: + pxor @XMM[8+3], @XMM[3] + lea 0x50($inp), $inp + pxor @XMM[8+4], @XMM[4] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[4] + movdqu @XMM[6], 0x20($out) + pxor 0x40(%rsp), @XMM[2] + movdqu @XMM[4], 0x30($out) + movdqu @XMM[2], 0x40($out) + lea 0x50($out), $out + + movdqa 0x50(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_4: + pxor @XMM[8+2], @XMM[2] + lea 0x40($inp), $inp + pxor @XMM[8+3], @XMM[3] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + pxor 0x30(%rsp), @XMM[4] + movdqu @XMM[6], 0x20($out) + movdqu @XMM[4], 0x30($out) + lea 0x40($out), $out + + movdqa 0x40(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_3: + pxor @XMM[8+1], @XMM[1] + lea 0x30($inp), $inp + pxor @XMM[8+2], @XMM[2] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + pxor 0x20(%rsp), @XMM[6] + movdqu @XMM[1], 0x10($out) + movdqu @XMM[6], 0x20($out) + lea 0x30($out), $out + + movdqa 0x30(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_2: + pxor @XMM[8+0], @XMM[0] + lea 0x20($inp), $inp + pxor @XMM[8+1], @XMM[1] + lea 0x80(%rsp), %rax # pass key schedule + mov %edx, %r10d # pass rounds + + call _bsaes_decrypt8 + + pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + pxor 0x10(%rsp), @XMM[1] + movdqu @XMM[0], 0x00($out) # write output + movdqu @XMM[1], 0x10($out) + lea 0x20($out), $out + + movdqa 0x20(%rsp), @XMM[7] # next iteration tweak + jmp .Lxts_dec_done +.align 16 +.Lxts_dec_1: + pxor @XMM[0], @XMM[8] + lea 0x10($inp), $inp + movdqa @XMM[8], 0x20(%rbp) + lea 0x20(%rbp), $arg1 + lea 0x20(%rbp), $arg2 + lea ($key), $arg3 + call asm_AES_decrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[0] # ^= tweak[] + #pxor @XMM[8], @XMM[0] + #lea 0x80(%rsp), %rax # pass key schedule + #mov %edx, %r10d # pass rounds + #call _bsaes_decrypt8 + #pxor 0x00(%rsp), @XMM[0] # ^= tweak[] + movdqu @XMM[0], 0x00($out) # write output + lea 0x10($out), $out + + movdqa 0x10(%rsp), @XMM[7] # next iteration tweak + +.Lxts_dec_done: + and \$15, %ebx + jz .Lxts_dec_ret + + pxor $twtmp, $twtmp + movdqa .Lxts_magic(%rip), $twmask + pcmpgtd @XMM[7], $twtmp + pshufd \$0x13, $twtmp, $twres + movdqa @XMM[7], @XMM[6] + paddq @XMM[7], @XMM[7] # psllq 1,$tweak + pand $twmask, $twres # isolate carry and residue + movdqu ($inp), @XMM[0] + pxor $twres, @XMM[7] + + lea 0x20(%rbp), $arg1 + pxor @XMM[7], @XMM[0] + lea 0x20(%rbp), $arg2 + movdqa @XMM[0], 0x20(%rbp) + lea ($key), $arg3 + call asm_AES_decrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[7] + mov $out, %rdx + movdqu @XMM[7], ($out) + +.Lxts_dec_steal: + movzb 16($inp), %eax + movzb (%rdx), %ecx + lea 1($inp), $inp + mov %al, (%rdx) + mov %cl, 16(%rdx) + lea 1(%rdx), %rdx + sub \$1,%ebx + jnz .Lxts_dec_steal + + movdqu ($out), @XMM[0] + lea 0x20(%rbp), $arg1 + pxor @XMM[6], @XMM[0] + lea 0x20(%rbp), $arg2 + movdqa @XMM[0], 0x20(%rbp) + lea ($key), $arg3 + call asm_AES_decrypt # doesn't touch %xmm + pxor 0x20(%rbp), @XMM[6] + movdqu @XMM[6], ($out) + +.Lxts_dec_ret: + lea (%rsp), %rax + pxor %xmm0, %xmm0 +.Lxts_dec_bzero: # wipe key schedule [if any] + movdqa %xmm0, 0x00(%rax) + movdqa %xmm0, 0x10(%rax) + lea 0x20(%rax), %rax + cmp %rax, %rbp + ja .Lxts_dec_bzero + + lea (%rbp),%rsp # restore %rsp +___ +$code.=<<___ if ($win64); + movaps 0x40(%rbp), %xmm6 + movaps 0x50(%rbp), %xmm7 + movaps 0x60(%rbp), %xmm8 + movaps 0x70(%rbp), %xmm9 + movaps 0x80(%rbp), %xmm10 + movaps 0x90(%rbp), %xmm11 + movaps 0xa0(%rbp), %xmm12 + movaps 0xb0(%rbp), %xmm13 + movaps 0xc0(%rbp), %xmm14 + movaps 0xd0(%rbp), %xmm15 + lea 0xa0(%rbp), %rsp +___ +$code.=<<___; + mov 0x48(%rsp), %r15 + mov 0x50(%rsp), %r14 + mov 0x58(%rsp), %r13 + mov 0x60(%rsp), %r12 + mov 0x68(%rsp), %rbx + mov 0x70(%rsp), %rax + lea 0x78(%rsp), %rsp + mov %rax, %rbp +.Lxts_dec_epilogue: + ret +.size bsaes_xts_decrypt,.-bsaes_xts_decrypt +___ +} +$code.=<<___; +.type _bsaes_const,\@object +.align 64 +_bsaes_const: +.LM0ISR: # InvShiftRows constants + .quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISRM0: + .quad 0x01040b0e0205080f, 0x0306090c00070a0d +.LISR: + .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LBS0: # bit-slice constants + .quad 0x5555555555555555, 0x5555555555555555 +.LBS1: + .quad 0x3333333333333333, 0x3333333333333333 +.LBS2: + .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f +.LSR: # shiftrows constants + .quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: + .quad 0x0304090e00050a0f, 0x01060b0c0207080d +.LM0SR: + .quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSWPUP: # byte-swap upper dword + .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 +.LSWPUPM0SR: + .quad 0x0a0d02060c03070b, 0x0004080f05090e01 +.LADD1: # counter increment constants + .quad 0x0000000000000000, 0x0000000100000000 +.LADD2: + .quad 0x0000000000000000, 0x0000000200000000 +.LADD3: + .quad 0x0000000000000000, 0x0000000300000000 +.LADD4: + .quad 0x0000000000000000, 0x0000000400000000 +.LADD5: + .quad 0x0000000000000000, 0x0000000500000000 +.LADD6: + .quad 0x0000000000000000, 0x0000000600000000 +.LADD7: + .quad 0x0000000000000000, 0x0000000700000000 +.LADD8: + .quad 0x0000000000000000, 0x0000000800000000 +.Lxts_magic: + .long 0x87,0,1,0 +.Lmasks: + .quad 0x0101010101010101, 0x0101010101010101 + .quad 0x0202020202020202, 0x0202020202020202 + .quad 0x0404040404040404, 0x0404040404040404 + .quad 0x0808080808080808, 0x0808080808080808 +.LM0: + .quad 0x02060a0e03070b0f, 0x0004080c0105090d +.L63: + .quad 0x6363636363636363, 0x6363636363636363 +.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov" +.align 64 +.size _bsaes_const,.-_bsaes_const +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + mov 160($context),%rax # pull context->Rbp + + lea 0x40(%rax),%rsi # %xmm save area + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0xa0(%rax),%rax # adjust stack pointer + + mov 0x70(%rax),%rbp + mov 0x68(%rax),%rbx + mov 0x60(%rax),%r12 + mov 0x58(%rax),%r13 + mov 0x50(%rax),%r14 + mov 0x48(%rax),%r15 + lea 0x78(%rax),%rax # adjust stack pointer + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lin_prologue: + mov %rax,152($context) # restore context->Rsp + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$`1232/8`,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 +___ +$code.=<<___ if ($ecb); + .rva .Lecb_enc_prologue + .rva .Lecb_enc_epilogue + .rva .Lecb_enc_info + + .rva .Lecb_dec_prologue + .rva .Lecb_dec_epilogue + .rva .Lecb_dec_info +___ +$code.=<<___; + .rva .Lcbc_dec_prologue + .rva .Lcbc_dec_epilogue + .rva .Lcbc_dec_info + + .rva .Lctr_enc_prologue + .rva .Lctr_enc_epilogue + .rva .Lctr_enc_info + + .rva .Lxts_enc_prologue + .rva .Lxts_enc_epilogue + .rva .Lxts_enc_info + + .rva .Lxts_dec_prologue + .rva .Lxts_dec_epilogue + .rva .Lxts_dec_info + +.section .xdata +.align 8 +___ +$code.=<<___ if ($ecb); +.Lecb_enc_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[] +.Lecb_dec_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[] +___ +$code.=<<___; +.Lcbc_dec_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[] +.Lctr_enc_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[] +.Lxts_enc_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[] +.Lxts_dec_info: + .byte 9,0,0,0 + .rva se_handler + .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[] +___ +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86.pl b/src/lib/libcrypto/aes/asm/vpaes-x86.pl new file mode 100644 index 0000000000..1533e2c304 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/vpaes-x86.pl @@ -0,0 +1,903 @@ +#!/usr/bin/env perl + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. + +###################################################################### +# September 2011. +# +# Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for +# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt +# doesn't handle partial vectors (doesn't have to if called from +# EVP only). "Drop-in" implies that this module doesn't share key +# schedule structure with the original nor does it make assumption +# about its alignment... +# +# Performance summary. aes-586.pl column lists large-block CBC +# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per +# byte processed with 128-bit key, and vpaes-x86.pl column - [also +# large-block CBC] encrypt/decrypt. +# +# aes-586.pl vpaes-x86.pl +# +# Core 2(**) 29.1/42.3/18.3 22.0/25.6(***) +# Nehalem 27.9/40.4/18.1 10.3/12.0 +# Atom 102./119./60.1 64.5/85.3(***) +# +# (*) "Hyper-threading" in the context refers rather to cache shared +# among multiple cores, than to specifically Intel HTT. As vast +# majority of contemporary cores share cache, slower code path +# is common place. In other words "with-hyper-threading-off" +# results are presented mostly for reference purposes. +# +# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. +# +# (***) Less impressive improvement on Core 2 and Atom is due to slow +# pshufb, yet it's respectable +32%/65% improvement on Core 2 +# and +58%/40% on Atom (as implied, over "hyper-threading-safe" +# code path). +# +# + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); + +$PREFIX="vpaes"; + +my ($round, $base, $magic, $key, $const, $inp, $out)= + ("eax", "ebx", "ecx", "edx","ebp", "esi","edi"); + +&static_label("_vpaes_consts"); +&static_label("_vpaes_schedule_low_round"); + +&set_label("_vpaes_consts",64); +$k_inv=-0x30; # inv, inva + &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309); + &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C); + +$k_s0F=-0x10; # s0F + &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F); + +$k_ipt=0x00; # input transform (lo, hi) + &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090); + &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC); + +$k_sb1=0x20; # sb1u, sb1t + &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E); + &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1); +$k_sb2=0x40; # sb2u, sb2t + &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955); + &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8); +$k_sbo=0x60; # sbou, sbot + &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A); + &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1); + +$k_mc_forward=0x80; # mc_forward + &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D); + &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201); + &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605); + &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09); + +$k_mc_backward=0xc0; # mc_backward + &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F); + &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B); + &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407); + &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003); + +$k_sr=0x100; # sr + &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C); + &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C); + &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C); + &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C); + +$k_rcon=0x140; # rcon + &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808); + +$k_s63=0x150; # s63: all equal to 0x63 transformed + &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B); + +$k_opt=0x160; # output transform + &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121); + &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1); + +$k_deskew=0x180; # deskew tables: inverts the sbox's "skew" + &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A); + &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB); +## +## Decryption stuff +## Key schedule constants +## +$k_dksd=0x1a0; # decryption key schedule: invskew x*D + &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4); + &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA); +$k_dksb=0x1c0; # decryption key schedule: invskew x*B + &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386); + &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F); +$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63 + &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C); + &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A); +$k_dks9=0x200; # decryption key schedule: invskew x*9 + &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334); + &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC); + +## +## Decryption stuff +## Round function constants +## +$k_dipt=0x220; # decryption input transform + &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E); + &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772); + +$k_dsb9=0x240; # decryption sbox output *9*u, *9*t + &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50); + &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E); +$k_dsbd=0x260; # decryption sbox output *D*u, *D*t + &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13); + &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D); +$k_dsbb=0x280; # decryption sbox output *B*u, *B*t + &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6); + &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E); +$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t + &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004); + &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B); +$k_dsbo=0x2c0; # decryption sbox final output + &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9); + &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159); +&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)"); +&align (64); + +&function_begin_B("_vpaes_preheat"); + &add ($const,&DWP(0,"esp")); + &movdqa ("xmm7",&QWP($k_inv,$const)); + &movdqa ("xmm6",&QWP($k_s0F,$const)); + &ret (); +&function_end_B("_vpaes_preheat"); + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm6-%xmm7 as in _vpaes_preheat +## (%edx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx +## +## +&function_begin_B("_vpaes_encrypt_core"); + &mov ($magic,16); + &mov ($round,&DWP(240,$key)); + &movdqa ("xmm1","xmm6") + &movdqa ("xmm2",&QWP($k_ipt,$const)); + &pandn ("xmm1","xmm0"); + &movdqu ("xmm5",&QWP(0,$key)); + &psrld ("xmm1",4); + &pand ("xmm0","xmm6"); + &pshufb ("xmm2","xmm0"); + &movdqa ("xmm0",&QWP($k_ipt+16,$const)); + &pshufb ("xmm0","xmm1"); + &pxor ("xmm2","xmm5"); + &pxor ("xmm0","xmm2"); + &add ($key,16); + &lea ($base,&DWP($k_mc_backward,$const)); + &jmp (&label("enc_entry")); + + +&set_label("enc_loop",16); + # middle of middle round + &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u + &pshufb ("xmm4","xmm2"); # 4 = sb1u + &pxor ("xmm4","xmm5"); # 4 = sb1u + k + &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &pxor ("xmm0","xmm4"); # 0 = A + &movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u + &pshufb ("xmm5","xmm2"); # 4 = sb2u + &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[] + &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t + &pshufb ("xmm2","xmm3"); # 2 = sb2t + &pxor ("xmm2","xmm5"); # 2 = 2A + &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[] + &movdqa ("xmm3","xmm0"); # 3 = A + &pshufb ("xmm0","xmm1"); # 0 = B + &add ($key,16); # next key + &pxor ("xmm0","xmm2"); # 0 = 2A+B + &pshufb ("xmm3","xmm4"); # 3 = D + &add ($magic,16); # next mc + &pxor ("xmm3","xmm0"); # 3 = 2A+B+D + &pshufb ("xmm0","xmm1"); # 0 = 2B+C + &and ($magic,0x30); # ... mod 4 + &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D + &sub ($round,1); # nr-- + +&set_label("enc_entry"); + # top of round + &movdqa ("xmm1","xmm6"); # 1 : i + &pandn ("xmm1","xmm0"); # 1 = i<<4 + &psrld ("xmm1",4); # 1 = i + &pand ("xmm0","xmm6"); # 0 = k + &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k + &pshufb ("xmm5","xmm0"); # 2 = a/k + &pxor ("xmm0","xmm1"); # 0 = j + &movdqa ("xmm3","xmm7"); # 3 : 1/i + &pshufb ("xmm3","xmm1"); # 3 = 1/i + &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k + &movdqa ("xmm4","xmm7"); # 4 : 1/j + &pshufb ("xmm4","xmm0"); # 4 = 1/j + &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k + &movdqa ("xmm2","xmm7"); # 2 : 1/iak + &pshufb ("xmm2","xmm3"); # 2 = 1/iak + &pxor ("xmm2","xmm0"); # 2 = io + &movdqa ("xmm3","xmm7"); # 3 : 1/jak + &movdqu ("xmm5",&QWP(0,$key)); + &pshufb ("xmm3","xmm4"); # 3 = 1/jak + &pxor ("xmm3","xmm1"); # 3 = jo + &jnz (&label("enc_loop")); + + # middle of last round + &movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo + &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16 + &pshufb ("xmm4","xmm2"); # 4 = sbou + &pxor ("xmm4","xmm5"); # 4 = sb1u + k + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[] + &pxor ("xmm0","xmm4"); # 0 = A + &pshufb ("xmm0","xmm1"); + &ret (); +&function_end_B("_vpaes_encrypt_core"); + +## +## Decryption core +## +## Same API as encryption core. +## +&function_begin_B("_vpaes_decrypt_core"); + &mov ($round,&DWP(240,$key)); + &lea ($base,&DWP($k_dsbd,$const)); + &movdqa ("xmm1","xmm6"); + &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); + &pandn ("xmm1","xmm0"); + &mov ($magic,$round); + &psrld ("xmm1",4) + &movdqu ("xmm5",&QWP(0,$key)); + &shl ($magic,4); + &pand ("xmm0","xmm6"); + &pshufb ("xmm2","xmm0"); + &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base)); + &xor ($magic,0x30); + &pshufb ("xmm0","xmm1"); + &and ($magic,0x30); + &pxor ("xmm2","xmm5"); + &movdqa ("xmm5",&QWP($k_mc_forward+48,$const)); + &pxor ("xmm0","xmm2"); + &add ($key,16); + &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic)); + &jmp (&label("dec_entry")); + +&set_label("dec_loop",16); +## +## Inverse mix columns +## + &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u + &pshufb ("xmm4","xmm2"); # 4 = sb9u + &pxor ("xmm4","xmm0"); + &movdqa ("xmm0",&QWP(-0x10,$base)); # 0 : sb9t + &pshufb ("xmm0","xmm3"); # 0 = sb9t + &pxor ("xmm0","xmm4"); # 0 = ch + &add ($key,16); # next round key + + &pshufb ("xmm0","xmm5"); # MC ch + &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu + &pshufb ("xmm4","xmm2"); # 4 = sbdu + &pxor ("xmm4","xmm0"); # 4 = ch + &movdqa ("xmm0",&QWP(0x10,$base)); # 0 : sbdt + &pshufb ("xmm0","xmm3"); # 0 = sbdt + &pxor ("xmm0","xmm4"); # 0 = ch + &sub ($round,1); # nr-- + + &pshufb ("xmm0","xmm5"); # MC ch + &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu + &pshufb ("xmm4","xmm2"); # 4 = sbbu + &pxor ("xmm4","xmm0"); # 4 = ch + &movdqa ("xmm0",&QWP(0x30,$base)); # 0 : sbbt + &pshufb ("xmm0","xmm3"); # 0 = sbbt + &pxor ("xmm0","xmm4"); # 0 = ch + + &pshufb ("xmm0","xmm5"); # MC ch + &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu + &pshufb ("xmm4","xmm2"); # 4 = sbeu + &pxor ("xmm4","xmm0"); # 4 = ch + &movdqa ("xmm0",&QWP(0x50,$base)); # 0 : sbet + &pshufb ("xmm0","xmm3"); # 0 = sbet + &pxor ("xmm0","xmm4"); # 0 = ch + + &palignr("xmm5","xmm5",12); + +&set_label("dec_entry"); + # top of round + &movdqa ("xmm1","xmm6"); # 1 : i + &pandn ("xmm1","xmm0"); # 1 = i<<4 + &psrld ("xmm1",4); # 1 = i + &pand ("xmm0","xmm6"); # 0 = k + &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k + &pshufb ("xmm2","xmm0"); # 2 = a/k + &pxor ("xmm0","xmm1"); # 0 = j + &movdqa ("xmm3","xmm7"); # 3 : 1/i + &pshufb ("xmm3","xmm1"); # 3 = 1/i + &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k + &movdqa ("xmm4","xmm7"); # 4 : 1/j + &pshufb ("xmm4","xmm0"); # 4 = 1/j + &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k + &movdqa ("xmm2","xmm7"); # 2 : 1/iak + &pshufb ("xmm2","xmm3"); # 2 = 1/iak + &pxor ("xmm2","xmm0"); # 2 = io + &movdqa ("xmm3","xmm7"); # 3 : 1/jak + &pshufb ("xmm3","xmm4"); # 3 = 1/jak + &pxor ("xmm3","xmm1"); # 3 = jo + &movdqu ("xmm0",&QWP(0,$key)); + &jnz (&label("dec_loop")); + + # middle of last round + &movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou + &pshufb ("xmm4","xmm2"); # 4 = sbou + &pxor ("xmm4","xmm0"); # 4 = sb1u + k + &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot + &movdqa ("xmm2",&QWP(0,$magic)); + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &pxor ("xmm0","xmm4"); # 0 = A + &pshufb ("xmm0","xmm2"); + &ret (); +&function_end_B("_vpaes_decrypt_core"); + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +&function_begin_B("_vpaes_schedule_core"); + &add ($const,&DWP(0,"esp")); + &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned) + &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon + + # input transform + &movdqa ("xmm3","xmm0"); + &lea ($base,&DWP($k_ipt,$const)); + &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8 + &call ("_vpaes_schedule_transform"); + &movdqa ("xmm7","xmm0"); + + &test ($out,$out); + &jnz (&label("schedule_am_decrypting")); + + # encrypting, output zeroth round key after transform + &movdqu (&QWP(0,$key),"xmm0"); + &jmp (&label("schedule_go")); + +&set_label("schedule_am_decrypting"); + # decrypting, output zeroth round key after shiftrows + &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); + &pshufb ("xmm3","xmm1"); + &movdqu (&QWP(0,$key),"xmm3"); + &xor ($magic,0x30); + +&set_label("schedule_go"); + &cmp ($round,192); + &ja (&label("schedule_256")); + &je (&label("schedule_192")); + # 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +&set_label("schedule_128"); + &mov ($round,10); + +&set_label("loop_schedule_128"); + &call ("_vpaes_schedule_round"); + &dec ($round); + &jz (&label("schedule_mangle_last")); + &call ("_vpaes_schedule_mangle"); # write output + &jmp (&label("loop_schedule_128")); + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +&set_label("schedule_192",16); + &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned) + &call ("_vpaes_schedule_transform"); # input transform + &movdqa ("xmm6","xmm0"); # save short part + &pxor ("xmm4","xmm4"); # clear 4 + &movhlps("xmm6","xmm4"); # clobber low side with zeros + &mov ($round,4); + +&set_label("loop_schedule_192"); + &call ("_vpaes_schedule_round"); + &palignr("xmm0","xmm6",8); + &call ("_vpaes_schedule_mangle"); # save key n + &call ("_vpaes_schedule_192_smear"); + &call ("_vpaes_schedule_mangle"); # save key n+1 + &call ("_vpaes_schedule_round"); + &dec ($round); + &jz (&label("schedule_mangle_last")); + &call ("_vpaes_schedule_mangle"); # save key n+2 + &call ("_vpaes_schedule_192_smear"); + &jmp (&label("loop_schedule_192")); + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +&set_label("schedule_256",16); + &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned) + &call ("_vpaes_schedule_transform"); # input transform + &mov ($round,7); + +&set_label("loop_schedule_256"); + &call ("_vpaes_schedule_mangle"); # output low result + &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6 + + # high round + &call ("_vpaes_schedule_round"); + &dec ($round); + &jz (&label("schedule_mangle_last")); + &call ("_vpaes_schedule_mangle"); + + # low round. swap xmm7 and xmm6 + &pshufd ("xmm0","xmm0",0xFF); + &movdqa (&QWP(20,"esp"),"xmm7"); + &movdqa ("xmm7","xmm6"); + &call ("_vpaes_schedule_low_round"); + &movdqa ("xmm7",&QWP(20,"esp")); + + &jmp (&label("loop_schedule_256")); + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +&set_label("schedule_mangle_last",16); + # schedule last round key from xmm0 + &lea ($base,&DWP($k_deskew,$const)); + &test ($out,$out); + &jnz (&label("schedule_mangle_last_dec")); + + # encrypting + &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); + &pshufb ("xmm0","xmm1"); # output permute + &lea ($base,&DWP($k_opt,$const)); # prepare to output transform + &add ($key,32); + +&set_label("schedule_mangle_last_dec"); + &add ($key,-16); + &pxor ("xmm0",&QWP($k_s63,$const)); + &call ("_vpaes_schedule_transform"); # output transform + &movdqu (&QWP(0,$key),"xmm0"); # save last key + + # cleanup + &pxor ("xmm0","xmm0"); + &pxor ("xmm1","xmm1"); + &pxor ("xmm2","xmm2"); + &pxor ("xmm3","xmm3"); + &pxor ("xmm4","xmm4"); + &pxor ("xmm5","xmm5"); + &pxor ("xmm6","xmm6"); + &pxor ("xmm7","xmm7"); + &ret (); +&function_end_B("_vpaes_schedule_core"); + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +&function_begin_B("_vpaes_schedule_192_smear"); + &pshufd ("xmm0","xmm6",0x80); # d c 0 0 -> c 0 0 0 + &pxor ("xmm6","xmm0"); # -> c+d c 0 0 + &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a + &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a + &movdqa ("xmm0","xmm6"); + &pxor ("xmm1","xmm1"); + &movhlps("xmm6","xmm1"); # clobber low side with zeros + &ret (); +&function_end_B("_vpaes_schedule_192_smear"); + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm5. +## +&function_begin_B("_vpaes_schedule_round"); + # extract rcon from xmm8 + &movdqa ("xmm2",&QWP(8,"esp")); # xmm8 + &pxor ("xmm1","xmm1"); + &palignr("xmm1","xmm2",15); + &palignr("xmm2","xmm2",15); + &pxor ("xmm7","xmm1"); + + # rotate + &pshufd ("xmm0","xmm0",0xFF); + &palignr("xmm0","xmm0",1); + + # fall through... + &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8 + + # low round: same as high round, but no rotation and no rcon. +&set_label("_vpaes_schedule_low_round"); + # smear xmm7 + &movdqa ("xmm1","xmm7"); + &pslldq ("xmm7",4); + &pxor ("xmm7","xmm1"); + &movdqa ("xmm1","xmm7"); + &pslldq ("xmm7",8); + &pxor ("xmm7","xmm1"); + &pxor ("xmm7",&QWP($k_s63,$const)); + + # subbyte + &movdqa ("xmm4",&QWP($k_s0F,$const)); + &movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j + &movdqa ("xmm1","xmm4"); + &pandn ("xmm1","xmm0"); + &psrld ("xmm1",4); # 1 = i + &pand ("xmm0","xmm4"); # 0 = k + &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k + &pshufb ("xmm2","xmm0"); # 2 = a/k + &pxor ("xmm0","xmm1"); # 0 = j + &movdqa ("xmm3","xmm5"); # 3 : 1/i + &pshufb ("xmm3","xmm1"); # 3 = 1/i + &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k + &movdqa ("xmm4","xmm5"); # 4 : 1/j + &pshufb ("xmm4","xmm0"); # 4 = 1/j + &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k + &movdqa ("xmm2","xmm5"); # 2 : 1/iak + &pshufb ("xmm2","xmm3"); # 2 = 1/iak + &pxor ("xmm2","xmm0"); # 2 = io + &movdqa ("xmm3","xmm5"); # 3 : 1/jak + &pshufb ("xmm3","xmm4"); # 3 = 1/jak + &pxor ("xmm3","xmm1"); # 3 = jo + &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou + &pshufb ("xmm4","xmm2"); # 4 = sbou + &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot + &pshufb ("xmm0","xmm3"); # 0 = sb1t + &pxor ("xmm0","xmm4"); # 0 = sbox output + + # add in smeared stuff + &pxor ("xmm0","xmm7"); + &movdqa ("xmm7","xmm0"); + &ret (); +&function_end_B("_vpaes_schedule_round"); + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%ebx) +## +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +&function_begin_B("_vpaes_schedule_transform"); + &movdqa ("xmm2",&QWP($k_s0F,$const)); + &movdqa ("xmm1","xmm2"); + &pandn ("xmm1","xmm0"); + &psrld ("xmm1",4); + &pand ("xmm0","xmm2"); + &movdqa ("xmm2",&QWP(0,$base)); + &pshufb ("xmm2","xmm0"); + &movdqa ("xmm0",&QWP(16,$base)); + &pshufb ("xmm0","xmm1"); + &pxor ("xmm0","xmm2"); + &ret (); +&function_end_B("_vpaes_schedule_transform"); + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. +## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%edx), and increments or decrements it +## Keeps track of round number mod 4 in %ecx +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +&function_begin_B("_vpaes_schedule_mangle"); + &movdqa ("xmm4","xmm0"); # save xmm0 for later + &movdqa ("xmm5",&QWP($k_mc_forward,$const)); + &test ($out,$out); + &jnz (&label("schedule_mangle_dec")); + + # encrypting + &add ($key,16); + &pxor ("xmm4",&QWP($k_s63,$const)); + &pshufb ("xmm4","xmm5"); + &movdqa ("xmm3","xmm4"); + &pshufb ("xmm4","xmm5"); + &pxor ("xmm3","xmm4"); + &pshufb ("xmm4","xmm5"); + &pxor ("xmm3","xmm4"); + + &jmp (&label("schedule_mangle_both")); + +&set_label("schedule_mangle_dec",16); + # inverse mix columns + &movdqa ("xmm2",&QWP($k_s0F,$const)); + &lea ($inp,&DWP($k_dksd,$const)); + &movdqa ("xmm1","xmm2"); + &pandn ("xmm1","xmm4"); + &psrld ("xmm1",4); # 1 = hi + &pand ("xmm4","xmm2"); # 4 = lo + + &movdqa ("xmm2",&QWP(0,$inp)); + &pshufb ("xmm2","xmm4"); + &movdqa ("xmm3",&QWP(0x10,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + &pshufb ("xmm3","xmm5"); + + &movdqa ("xmm2",&QWP(0x20,$inp)); + &pshufb ("xmm2","xmm4"); + &pxor ("xmm2","xmm3"); + &movdqa ("xmm3",&QWP(0x30,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + &pshufb ("xmm3","xmm5"); + + &movdqa ("xmm2",&QWP(0x40,$inp)); + &pshufb ("xmm2","xmm4"); + &pxor ("xmm2","xmm3"); + &movdqa ("xmm3",&QWP(0x50,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + &pshufb ("xmm3","xmm5"); + + &movdqa ("xmm2",&QWP(0x60,$inp)); + &pshufb ("xmm2","xmm4"); + &pxor ("xmm2","xmm3"); + &movdqa ("xmm3",&QWP(0x70,$inp)); + &pshufb ("xmm3","xmm1"); + &pxor ("xmm3","xmm2"); + + &add ($key,-16); + +&set_label("schedule_mangle_both"); + &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); + &pshufb ("xmm3","xmm1"); + &add ($magic,-16); + &and ($magic,0x30); + &movdqu (&QWP(0,$key),"xmm3"); + &ret (); +&function_end_B("_vpaes_schedule_mangle"); + +# +# Interface to OpenSSL +# +&function_begin("${PREFIX}_set_encrypt_key"); + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($round,&wparam(1)); # bits + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); + + &mov ($base,$round); + &shr ($base,5); + &add ($base,5); + &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; + &mov ($magic,0x30); + &mov ($out,0); + + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_schedule_core"); +&set_label("pic_point"); + + &mov ("esp",&DWP(48,"esp")); + &xor ("eax","eax"); +&function_end("${PREFIX}_set_encrypt_key"); + +&function_begin("${PREFIX}_set_decrypt_key"); + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($round,&wparam(1)); # bits + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); + + &mov ($base,$round); + &shr ($base,5); + &add ($base,5); + &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5; + &shl ($base,4); + &lea ($key,&DWP(16,$key,$base)); + + &mov ($out,1); + &mov ($magic,$round); + &shr ($magic,1); + &and ($magic,32); + &xor ($magic,32); # nbist==192?0:32; + + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_schedule_core"); +&set_label("pic_point"); + + &mov ("esp",&DWP(48,"esp")); + &xor ("eax","eax"); +&function_end("${PREFIX}_set_decrypt_key"); + +&function_begin("${PREFIX}_encrypt"); + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_preheat"); +&set_label("pic_point"); + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($out,&wparam(1)); # out + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); + + &movdqu ("xmm0",&QWP(0,$inp)); + &call ("_vpaes_encrypt_core"); + &movdqu (&QWP(0,$out),"xmm0"); + + &mov ("esp",&DWP(48,"esp")); +&function_end("${PREFIX}_encrypt"); + +&function_begin("${PREFIX}_decrypt"); + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_preheat"); +&set_label("pic_point"); + &mov ($inp,&wparam(0)); # inp + &lea ($base,&DWP(-56,"esp")); + &mov ($out,&wparam(1)); # out + &and ($base,-16); + &mov ($key,&wparam(2)); # key + &xchg ($base,"esp"); # alloca + &mov (&DWP(48,"esp"),$base); + + &movdqu ("xmm0",&QWP(0,$inp)); + &call ("_vpaes_decrypt_core"); + &movdqu (&QWP(0,$out),"xmm0"); + + &mov ("esp",&DWP(48,"esp")); +&function_end("${PREFIX}_decrypt"); + +&function_begin("${PREFIX}_cbc_encrypt"); + &mov ($inp,&wparam(0)); # inp + &mov ($out,&wparam(1)); # out + &mov ($round,&wparam(2)); # len + &mov ($key,&wparam(3)); # key + &sub ($round,16); + &jc (&label("cbc_abort")); + &lea ($base,&DWP(-56,"esp")); + &mov ($const,&wparam(4)); # ivp + &and ($base,-16); + &mov ($magic,&wparam(5)); # enc + &xchg ($base,"esp"); # alloca + &movdqu ("xmm1",&QWP(0,$const)); # load IV + &sub ($out,$inp); + &mov (&DWP(48,"esp"),$base); + + &mov (&DWP(0,"esp"),$out); # save out + &mov (&DWP(4,"esp"),$key) # save key + &mov (&DWP(8,"esp"),$const); # save ivp + &mov ($out,$round); # $out works as $len + + &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); + &call ("_vpaes_preheat"); +&set_label("pic_point"); + &cmp ($magic,0); + &je (&label("cbc_dec_loop")); + &jmp (&label("cbc_enc_loop")); + +&set_label("cbc_enc_loop",16); + &movdqu ("xmm0",&QWP(0,$inp)); # load input + &pxor ("xmm0","xmm1"); # inp^=iv + &call ("_vpaes_encrypt_core"); + &mov ($base,&DWP(0,"esp")); # restore out + &mov ($key,&DWP(4,"esp")); # restore key + &movdqa ("xmm1","xmm0"); + &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output + &lea ($inp,&DWP(16,$inp)); + &sub ($out,16); + &jnc (&label("cbc_enc_loop")); + &jmp (&label("cbc_done")); + +&set_label("cbc_dec_loop",16); + &movdqu ("xmm0",&QWP(0,$inp)); # load input + &movdqa (&QWP(16,"esp"),"xmm1"); # save IV + &movdqa (&QWP(32,"esp"),"xmm0"); # save future IV + &call ("_vpaes_decrypt_core"); + &mov ($base,&DWP(0,"esp")); # restore out + &mov ($key,&DWP(4,"esp")); # restore key + &pxor ("xmm0",&QWP(16,"esp")); # out^=iv + &movdqa ("xmm1",&QWP(32,"esp")); # load next IV + &movdqu (&QWP(0,$base,$inp),"xmm0"); # write output + &lea ($inp,&DWP(16,$inp)); + &sub ($out,16); + &jnc (&label("cbc_dec_loop")); + +&set_label("cbc_done"); + &mov ($base,&DWP(8,"esp")); # restore ivp + &mov ("esp",&DWP(48,"esp")); + &movdqu (&QWP(0,$base),"xmm1"); # write IV +&set_label("cbc_abort"); +&function_end("${PREFIX}_cbc_encrypt"); + +&asm_finish(); diff --git a/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl new file mode 100644 index 0000000000..37998db5e1 --- /dev/null +++ b/src/lib/libcrypto/aes/asm/vpaes-x86_64.pl @@ -0,0 +1,1206 @@ +#!/usr/bin/env perl + +###################################################################### +## Constant-time SSSE3 AES core implementation. +## version 0.1 +## +## By Mike Hamburg (Stanford University), 2009 +## Public domain. +## +## For details see http://shiftleft.org/papers/vector_aes/ and +## http://crypto.stanford.edu/vpaes/. + +###################################################################### +# September 2011. +# +# Interface to OpenSSL as "almost" drop-in replacement for +# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt +# doesn't handle partial vectors (doesn't have to if called from +# EVP only). "Drop-in" implies that this module doesn't share key +# schedule structure with the original nor does it make assumption +# about its alignment... +# +# Performance summary. aes-x86_64.pl column lists large-block CBC +# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per +# byte processed with 128-bit key, and vpaes-x86_64.pl column - +# [also large-block CBC] encrypt/decrypt. +# +# aes-x86_64.pl vpaes-x86_64.pl +# +# Core 2(**) 30.5/43.7/14.3 21.8/25.7(***) +# Nehalem 30.5/42.2/14.6 9.8/11.8 +# Atom 63.9/79.0/32.1 64.0/84.8(***) +# +# (*) "Hyper-threading" in the context refers rather to cache shared +# among multiple cores, than to specifically Intel HTT. As vast +# majority of contemporary cores share cache, slower code path +# is common place. In other words "with-hyper-threading-off" +# results are presented mostly for reference purposes. +# +# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe. +# +# (***) Less impressive improvement on Core 2 and Atom is due to slow +# pshufb, yet it's respectable +40%/78% improvement on Core 2 +# (as implied, over "hyper-threading-safe" code path). +# +# + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +$PREFIX="vpaes"; + +$code.=<<___; +.text + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,\@abi-omnipotent +.align 16 +_vpaes_encrypt_core: + mov %rdx, %r9 + mov \$16, %r11 + mov 240(%rdx),%eax + movdqa %xmm9, %xmm1 + movdqa .Lk_ipt(%rip), %xmm2 # iptlo + pandn %xmm0, %xmm1 + movdqu (%r9), %xmm5 # round0 key + psrld \$4, %xmm1 + pand %xmm9, %xmm0 + pshufb %xmm0, %xmm2 + movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi + pshufb %xmm1, %xmm0 + pxor %xmm5, %xmm2 + pxor %xmm2, %xmm0 + add \$16, %r9 + lea .Lk_mc_backward(%rip),%r10 + jmp .Lenc_entry + +.align 16 +.Lenc_loop: + # middle of middle round + movdqa %xmm13, %xmm4 # 4 : sb1u + pshufb %xmm2, %xmm4 # 4 = sb1u + pxor %xmm5, %xmm4 # 4 = sb1u + k + movdqa %xmm12, %xmm0 # 0 : sb1t + pshufb %xmm3, %xmm0 # 0 = sb1t + pxor %xmm4, %xmm0 # 0 = A + movdqa %xmm15, %xmm5 # 4 : sb2u + pshufb %xmm2, %xmm5 # 4 = sb2u + movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + movdqa %xmm14, %xmm2 # 2 : sb2t + pshufb %xmm3, %xmm2 # 2 = sb2t + pxor %xmm5, %xmm2 # 2 = 2A + movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + movdqa %xmm0, %xmm3 # 3 = A + pshufb %xmm1, %xmm0 # 0 = B + add \$16, %r9 # next key + pxor %xmm2, %xmm0 # 0 = 2A+B + pshufb %xmm4, %xmm3 # 3 = D + add \$16, %r11 # next mc + pxor %xmm0, %xmm3 # 3 = 2A+B+D + pshufb %xmm1, %xmm0 # 0 = 2B+C + and \$0x30, %r11 # ... mod 4 + pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D + sub \$1,%rax # nr-- + +.Lenc_entry: + # top of round + movdqa %xmm9, %xmm1 # 1 : i + pandn %xmm0, %xmm1 # 1 = i<<4 + psrld \$4, %xmm1 # 1 = i + pand %xmm9, %xmm0 # 0 = k + movdqa %xmm11, %xmm5 # 2 : a/k + pshufb %xmm0, %xmm5 # 2 = a/k + pxor %xmm1, %xmm0 # 0 = j + movdqa %xmm10, %xmm3 # 3 : 1/i + pshufb %xmm1, %xmm3 # 3 = 1/i + pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k + movdqa %xmm10, %xmm4 # 4 : 1/j + pshufb %xmm0, %xmm4 # 4 = 1/j + pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k + movdqa %xmm10, %xmm2 # 2 : 1/iak + pshufb %xmm3, %xmm2 # 2 = 1/iak + pxor %xmm0, %xmm2 # 2 = io + movdqa %xmm10, %xmm3 # 3 : 1/jak + movdqu (%r9), %xmm5 + pshufb %xmm4, %xmm3 # 3 = 1/jak + pxor %xmm1, %xmm3 # 3 = jo + jnz .Lenc_loop + + # middle of last round + movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + pshufb %xmm2, %xmm4 # 4 = sbou + pxor %xmm5, %xmm4 # 4 = sb1u + k + pshufb %xmm3, %xmm0 # 0 = sb1t + movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + pxor %xmm4, %xmm0 # 0 = A + pshufb %xmm1, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +## +## Decryption core +## +## Same API as encryption core. +## +.type _vpaes_decrypt_core,\@abi-omnipotent +.align 16 +_vpaes_decrypt_core: + mov %rdx, %r9 # load key + mov 240(%rdx),%eax + movdqa %xmm9, %xmm1 + movdqa .Lk_dipt(%rip), %xmm2 # iptlo + pandn %xmm0, %xmm1 + mov %rax, %r11 + psrld \$4, %xmm1 + movdqu (%r9), %xmm5 # round0 key + shl \$4, %r11 + pand %xmm9, %xmm0 + pshufb %xmm0, %xmm2 + movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi + xor \$0x30, %r11 + lea .Lk_dsbd(%rip),%r10 + pshufb %xmm1, %xmm0 + and \$0x30, %r11 + pxor %xmm5, %xmm2 + movdqa .Lk_mc_forward+48(%rip), %xmm5 + pxor %xmm2, %xmm0 + add \$16, %r9 + add %r10, %r11 + jmp .Ldec_entry + +.align 16 +.Ldec_loop: +## +## Inverse mix columns +## + movdqa -0x20(%r10),%xmm4 # 4 : sb9u + pshufb %xmm2, %xmm4 # 4 = sb9u + pxor %xmm0, %xmm4 + movdqa -0x10(%r10),%xmm0 # 0 : sb9t + pshufb %xmm3, %xmm0 # 0 = sb9t + pxor %xmm4, %xmm0 # 0 = ch + add \$16, %r9 # next round key + + pshufb %xmm5, %xmm0 # MC ch + movdqa 0x00(%r10),%xmm4 # 4 : sbdu + pshufb %xmm2, %xmm4 # 4 = sbdu + pxor %xmm0, %xmm4 # 4 = ch + movdqa 0x10(%r10),%xmm0 # 0 : sbdt + pshufb %xmm3, %xmm0 # 0 = sbdt + pxor %xmm4, %xmm0 # 0 = ch + sub \$1,%rax # nr-- + + pshufb %xmm5, %xmm0 # MC ch + movdqa 0x20(%r10),%xmm4 # 4 : sbbu + pshufb %xmm2, %xmm4 # 4 = sbbu + pxor %xmm0, %xmm4 # 4 = ch + movdqa 0x30(%r10),%xmm0 # 0 : sbbt + pshufb %xmm3, %xmm0 # 0 = sbbt + pxor %xmm4, %xmm0 # 0 = ch + + pshufb %xmm5, %xmm0 # MC ch + movdqa 0x40(%r10),%xmm4 # 4 : sbeu + pshufb %xmm2, %xmm4 # 4 = sbeu + pxor %xmm0, %xmm4 # 4 = ch + movdqa 0x50(%r10),%xmm0 # 0 : sbet + pshufb %xmm3, %xmm0 # 0 = sbet + pxor %xmm4, %xmm0 # 0 = ch + + palignr \$12, %xmm5, %xmm5 + +.Ldec_entry: + # top of round + movdqa %xmm9, %xmm1 # 1 : i + pandn %xmm0, %xmm1 # 1 = i<<4 + psrld \$4, %xmm1 # 1 = i + pand %xmm9, %xmm0 # 0 = k + movdqa %xmm11, %xmm2 # 2 : a/k + pshufb %xmm0, %xmm2 # 2 = a/k + pxor %xmm1, %xmm0 # 0 = j + movdqa %xmm10, %xmm3 # 3 : 1/i + pshufb %xmm1, %xmm3 # 3 = 1/i + pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k + movdqa %xmm10, %xmm4 # 4 : 1/j + pshufb %xmm0, %xmm4 # 4 = 1/j + pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k + movdqa %xmm10, %xmm2 # 2 : 1/iak + pshufb %xmm3, %xmm2 # 2 = 1/iak + pxor %xmm0, %xmm2 # 2 = io + movdqa %xmm10, %xmm3 # 3 : 1/jak + pshufb %xmm4, %xmm3 # 3 = 1/jak + pxor %xmm1, %xmm3 # 3 = jo + movdqu (%r9), %xmm0 + jnz .Ldec_loop + + # middle of last round + movdqa 0x60(%r10), %xmm4 # 3 : sbou + pshufb %xmm2, %xmm4 # 4 = sbou + pxor %xmm0, %xmm4 # 4 = sb1u + k + movdqa 0x70(%r10), %xmm0 # 0 : sbot + movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + pshufb %xmm3, %xmm0 # 0 = sb1t + pxor %xmm4, %xmm0 # 0 = A + pshufb %xmm2, %xmm0 + ret +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_schedule_core,\@abi-omnipotent +.align 16 +_vpaes_schedule_core: + # rdi = key + # rsi = size in bits + # rdx = buffer + # rcx = direction. 0=encrypt, 1=decrypt + + call _vpaes_preheat # load the tables + movdqa .Lk_rcon(%rip), %xmm8 # load rcon + movdqu (%rdi), %xmm0 # load key (unaligned) + + # input transform + movdqa %xmm0, %xmm3 + lea .Lk_ipt(%rip), %r11 + call _vpaes_schedule_transform + movdqa %xmm0, %xmm7 + + lea .Lk_sr(%rip),%r10 + test %rcx, %rcx + jnz .Lschedule_am_decrypting + + # encrypting, output zeroth round key after transform + movdqu %xmm0, (%rdx) + jmp .Lschedule_go + +.Lschedule_am_decrypting: + # decrypting, output zeroth round key after shiftrows + movdqa (%r8,%r10),%xmm1 + pshufb %xmm1, %xmm3 + movdqu %xmm3, (%rdx) + xor \$0x30, %r8 + +.Lschedule_go: + cmp \$192, %esi + ja .Lschedule_256 + je .Lschedule_192 + # 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +.Lschedule_128: + mov \$10, %esi + +.Loop_schedule_128: + call _vpaes_schedule_round + dec %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle # write output + jmp .Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 16 +.Lschedule_192: + movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + call _vpaes_schedule_transform # input transform + movdqa %xmm0, %xmm6 # save short part + pxor %xmm4, %xmm4 # clear 4 + movhlps %xmm4, %xmm6 # clobber low side with zeros + mov \$4, %esi + +.Loop_schedule_192: + call _vpaes_schedule_round + palignr \$8,%xmm6,%xmm0 + call _vpaes_schedule_mangle # save key n + call _vpaes_schedule_192_smear + call _vpaes_schedule_mangle # save key n+1 + call _vpaes_schedule_round + dec %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle # save key n+2 + call _vpaes_schedule_192_smear + jmp .Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 16 +.Lschedule_256: + movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + call _vpaes_schedule_transform # input transform + mov \$7, %esi + +.Loop_schedule_256: + call _vpaes_schedule_mangle # output low result + movdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + # high round + call _vpaes_schedule_round + dec %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + + # low round. swap xmm7 and xmm6 + pshufd \$0xFF, %xmm0, %xmm0 + movdqa %xmm7, %xmm5 + movdqa %xmm6, %xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5, %xmm7 + + jmp .Loop_schedule_256 + + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 16 +.Lschedule_mangle_last: + # schedule last round key from xmm0 + lea .Lk_deskew(%rip),%r11 # prepare to deskew + test %rcx, %rcx + jnz .Lschedule_mangle_last_dec + + # encrypting + movdqa (%r8,%r10),%xmm1 + pshufb %xmm1, %xmm0 # output permute + lea .Lk_opt(%rip), %r11 # prepare to output transform + add \$32, %rdx + +.Lschedule_mangle_last_dec: + add \$-16, %rdx + pxor .Lk_s63(%rip), %xmm0 + call _vpaes_schedule_transform # output transform + movdqu %xmm0, (%rdx) # save last key + + # cleanup + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pxor %xmm4, %xmm4 + pxor %xmm5, %xmm5 + pxor %xmm6, %xmm6 + pxor %xmm7, %xmm7 + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.type _vpaes_schedule_192_smear,\@abi-omnipotent +.align 16 +_vpaes_schedule_192_smear: + pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0 + pxor %xmm0, %xmm6 # -> c+d c 0 0 + pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + pxor %xmm0, %xmm6 # -> b+c+d b+c b a + movdqa %xmm6, %xmm0 + pxor %xmm1, %xmm1 + movhlps %xmm1, %xmm6 # clobber low side with zeros + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## +.type _vpaes_schedule_round,\@abi-omnipotent +.align 16 +_vpaes_schedule_round: + # extract rcon from xmm8 + pxor %xmm1, %xmm1 + palignr \$15, %xmm8, %xmm1 + palignr \$15, %xmm8, %xmm8 + pxor %xmm1, %xmm7 + + # rotate + pshufd \$0xFF, %xmm0, %xmm0 + palignr \$1, %xmm0, %xmm0 + + # fall through... + + # low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + # smear xmm7 + movdqa %xmm7, %xmm1 + pslldq \$4, %xmm7 + pxor %xmm1, %xmm7 + movdqa %xmm7, %xmm1 + pslldq \$8, %xmm7 + pxor %xmm1, %xmm7 + pxor .Lk_s63(%rip), %xmm7 + + # subbytes + movdqa %xmm9, %xmm1 + pandn %xmm0, %xmm1 + psrld \$4, %xmm1 # 1 = i + pand %xmm9, %xmm0 # 0 = k + movdqa %xmm11, %xmm2 # 2 : a/k + pshufb %xmm0, %xmm2 # 2 = a/k + pxor %xmm1, %xmm0 # 0 = j + movdqa %xmm10, %xmm3 # 3 : 1/i + pshufb %xmm1, %xmm3 # 3 = 1/i + pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k + movdqa %xmm10, %xmm4 # 4 : 1/j + pshufb %xmm0, %xmm4 # 4 = 1/j + pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k + movdqa %xmm10, %xmm2 # 2 : 1/iak + pshufb %xmm3, %xmm2 # 2 = 1/iak + pxor %xmm0, %xmm2 # 2 = io + movdqa %xmm10, %xmm3 # 3 : 1/jak + pshufb %xmm4, %xmm3 # 3 = 1/jak + pxor %xmm1, %xmm3 # 3 = jo + movdqa %xmm13, %xmm4 # 4 : sbou + pshufb %xmm2, %xmm4 # 4 = sbou + movdqa %xmm12, %xmm0 # 0 : sbot + pshufb %xmm3, %xmm0 # 0 = sb1t + pxor %xmm4, %xmm0 # 0 = sbox output + + # add in smeared stuff + pxor %xmm7, %xmm0 + movdqa %xmm0, %xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,\@abi-omnipotent +.align 16 +_vpaes_schedule_transform: + movdqa %xmm9, %xmm1 + pandn %xmm0, %xmm1 + psrld \$4, %xmm1 + pand %xmm9, %xmm0 + movdqa (%r11), %xmm2 # lo + pshufb %xmm0, %xmm2 + movdqa 16(%r11), %xmm0 # hi + pshufb %xmm1, %xmm0 + pxor %xmm2, %xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. +## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,\@abi-omnipotent +.align 16 +_vpaes_schedule_mangle: + movdqa %xmm0, %xmm4 # save xmm0 for later + movdqa .Lk_mc_forward(%rip),%xmm5 + test %rcx, %rcx + jnz .Lschedule_mangle_dec + + # encrypting + add \$16, %rdx + pxor .Lk_s63(%rip),%xmm4 + pshufb %xmm5, %xmm4 + movdqa %xmm4, %xmm3 + pshufb %xmm5, %xmm4 + pxor %xmm4, %xmm3 + pshufb %xmm5, %xmm4 + pxor %xmm4, %xmm3 + + jmp .Lschedule_mangle_both +.align 16 +.Lschedule_mangle_dec: + # inverse mix columns + lea .Lk_dksd(%rip),%r11 + movdqa %xmm9, %xmm1 + pandn %xmm4, %xmm1 + psrld \$4, %xmm1 # 1 = hi + pand %xmm9, %xmm4 # 4 = lo + + movdqa 0x00(%r11), %xmm2 + pshufb %xmm4, %xmm2 + movdqa 0x10(%r11), %xmm3 + pshufb %xmm1, %xmm3 + pxor %xmm2, %xmm3 + pshufb %xmm5, %xmm3 + + movdqa 0x20(%r11), %xmm2 + pshufb %xmm4, %xmm2 + pxor %xmm3, %xmm2 + movdqa 0x30(%r11), %xmm3 + pshufb %xmm1, %xmm3 + pxor %xmm2, %xmm3 + pshufb %xmm5, %xmm3 + + movdqa 0x40(%r11), %xmm2 + pshufb %xmm4, %xmm2 + pxor %xmm3, %xmm2 + movdqa 0x50(%r11), %xmm3 + pshufb %xmm1, %xmm3 + pxor %xmm2, %xmm3 + pshufb %xmm5, %xmm3 + + movdqa 0x60(%r11), %xmm2 + pshufb %xmm4, %xmm2 + pxor %xmm3, %xmm2 + movdqa 0x70(%r11), %xmm3 + pshufb %xmm1, %xmm3 + pxor %xmm2, %xmm3 + + add \$-16, %rdx + +.Lschedule_mangle_both: + movdqa (%r8,%r10),%xmm1 + pshufb %xmm1,%xmm3 + add \$-16, %r8 + and \$0x30, %r8 + movdqu %xmm3, (%rdx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +# +# Interface to OpenSSL +# +.globl ${PREFIX}_set_encrypt_key +.type ${PREFIX}_set_encrypt_key,\@function,3 +.align 16 +${PREFIX}_set_encrypt_key: +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Lenc_key_body: +___ +$code.=<<___; + mov %esi,%eax + shr \$5,%eax + add \$5,%eax + mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov \$0,%ecx + mov \$0x30,%r8d + call _vpaes_schedule_core +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Lenc_key_epilogue: +___ +$code.=<<___; + xor %eax,%eax + ret +.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key + +.globl ${PREFIX}_set_decrypt_key +.type ${PREFIX}_set_decrypt_key,\@function,3 +.align 16 +${PREFIX}_set_decrypt_key: +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Ldec_key_body: +___ +$code.=<<___; + mov %esi,%eax + shr \$5,%eax + add \$5,%eax + mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + shl \$4,%eax + lea 16(%rdx,%rax),%rdx + + mov \$1,%ecx + mov %esi,%r8d + shr \$1,%r8d + and \$32,%r8d + xor \$32,%r8d # nbits==192?0:32 + call _vpaes_schedule_core +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Ldec_key_epilogue: +___ +$code.=<<___; + xor %eax,%eax + ret +.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key + +.globl ${PREFIX}_encrypt +.type ${PREFIX}_encrypt,\@function,3 +.align 16 +${PREFIX}_encrypt: +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Lenc_body: +___ +$code.=<<___; + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_encrypt_core + movdqu %xmm0,(%rsi) +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Lenc_epilogue: +___ +$code.=<<___; + ret +.size ${PREFIX}_encrypt,.-${PREFIX}_encrypt + +.globl ${PREFIX}_decrypt +.type ${PREFIX}_decrypt,\@function,3 +.align 16 +${PREFIX}_decrypt: +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Ldec_body: +___ +$code.=<<___; + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_decrypt_core + movdqu %xmm0,(%rsi) +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Ldec_epilogue: +___ +$code.=<<___; + ret +.size ${PREFIX}_decrypt,.-${PREFIX}_decrypt +___ +{ +my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); +# void AES_cbc_encrypt (const void char *inp, unsigned char *out, +# size_t length, const AES_KEY *key, +# unsigned char *ivp,const int enc); +$code.=<<___; +.globl ${PREFIX}_cbc_encrypt +.type ${PREFIX}_cbc_encrypt,\@function,6 +.align 16 +${PREFIX}_cbc_encrypt: + xchg $key,$len +___ +($len,$key)=($key,$len); +$code.=<<___; + sub \$16,$len + jc .Lcbc_abort +___ +$code.=<<___ if ($win64); + lea -0xb8(%rsp),%rsp + movaps %xmm6,0x10(%rsp) + movaps %xmm7,0x20(%rsp) + movaps %xmm8,0x30(%rsp) + movaps %xmm9,0x40(%rsp) + movaps %xmm10,0x50(%rsp) + movaps %xmm11,0x60(%rsp) + movaps %xmm12,0x70(%rsp) + movaps %xmm13,0x80(%rsp) + movaps %xmm14,0x90(%rsp) + movaps %xmm15,0xa0(%rsp) +.Lcbc_body: +___ +$code.=<<___; + movdqu ($ivp),%xmm6 # load IV + sub $inp,$out + call _vpaes_preheat + cmp \$0,${enc}d + je .Lcbc_dec_loop + jmp .Lcbc_enc_loop +.align 16 +.Lcbc_enc_loop: + movdqu ($inp),%xmm0 + pxor %xmm6,%xmm0 + call _vpaes_encrypt_core + movdqa %xmm0,%xmm6 + movdqu %xmm0,($out,$inp) + lea 16($inp),$inp + sub \$16,$len + jnc .Lcbc_enc_loop + jmp .Lcbc_done +.align 16 +.Lcbc_dec_loop: + movdqu ($inp),%xmm0 + movdqa %xmm0,%xmm7 + call _vpaes_decrypt_core + pxor %xmm6,%xmm0 + movdqa %xmm7,%xmm6 + movdqu %xmm0,($out,$inp) + lea 16($inp),$inp + sub \$16,$len + jnc .Lcbc_dec_loop +.Lcbc_done: + movdqu %xmm6,($ivp) # save IV +___ +$code.=<<___ if ($win64); + movaps 0x10(%rsp),%xmm6 + movaps 0x20(%rsp),%xmm7 + movaps 0x30(%rsp),%xmm8 + movaps 0x40(%rsp),%xmm9 + movaps 0x50(%rsp),%xmm10 + movaps 0x60(%rsp),%xmm11 + movaps 0x70(%rsp),%xmm12 + movaps 0x80(%rsp),%xmm13 + movaps 0x90(%rsp),%xmm14 + movaps 0xa0(%rsp),%xmm15 + lea 0xb8(%rsp),%rsp +.Lcbc_epilogue: +___ +$code.=<<___; +.Lcbc_abort: + ret +.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt +___ +} +$code.=<<___; +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## +.type _vpaes_preheat,\@abi-omnipotent +.align 16 +_vpaes_preheat: + lea .Lk_s0F(%rip), %r10 + movdqa -0x20(%r10), %xmm10 # .Lk_inv + movdqa -0x10(%r10), %xmm11 # .Lk_inv+16 + movdqa 0x00(%r10), %xmm9 # .Lk_s0F + movdqa 0x30(%r10), %xmm13 # .Lk_sb1 + movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16 + movdqa 0x50(%r10), %xmm15 # .Lk_sb2 + movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16 + ret +.size _vpaes_preheat,.-_vpaes_preheat +######################################################## +## ## +## Constants ## +## ## +######################################################## +.type _vpaes_consts,\@object +.align 64 +_vpaes_consts: +.Lk_inv: # inv, inva + .quad 0x0E05060F0D080180, 0x040703090A0B0C02 + .quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +.Lk_s0F: # s0F + .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +.Lk_ipt: # input transform (lo, hi) + .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 + .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +.Lk_sb1: # sb1u, sb1t + .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 + .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.Lk_sb2: # sb2u, sb2t + .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.Lk_sbo: # sbou, sbot + .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 + .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +.Lk_mc_forward: # mc_forward + .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 + .quad 0x080B0A0904070605, 0x000302010C0F0E0D + .quad 0x0C0F0E0D080B0A09, 0x0407060500030201 + .quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +.Lk_mc_backward:# mc_backward + .quad 0x0605040702010003, 0x0E0D0C0F0A09080B + .quad 0x020100030E0D0C0F, 0x0A09080B06050407 + .quad 0x0E0D0C0F0A09080B, 0x0605040702010003 + .quad 0x0A09080B06050407, 0x020100030E0D0C0F + +.Lk_sr: # sr + .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 + .quad 0x030E09040F0A0500, 0x0B06010C07020D08 + .quad 0x0F060D040B020900, 0x070E050C030A0108 + .quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +.Lk_rcon: # rcon + .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_s63: # s63: all equal to 0x63 transformed + .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +.Lk_opt: # output transform + .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 + .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +.Lk_deskew: # deskew tables: inverts the sbox's "skew" + .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A + .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +## +## Decryption stuff +## Key schedule constants +## +.Lk_dksd: # decryption key schedule: invskew x*D + .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 + .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: # decryption key schedule: invskew x*B + .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 + .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: # decryption key schedule: invskew x*E + 0x63 + .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 + .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: # decryption key schedule: invskew x*9 + .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC + .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +## +## Decryption stuff +## Round function constants +## +.Lk_dipt: # decryption input transform + .quad 0x0F505B040B545F00, 0x154A411E114E451A + .quad 0x86E383E660056500, 0x12771772F491F194 + +.Lk_dsb9: # decryption sbox output *9*u, *9*t + .quad 0x851C03539A86D600, 0xCAD51F504F994CC9 + .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: # decryption sbox output *D*u, *D*t + .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 + .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: # decryption sbox output *B*u, *B*t + .quad 0xD022649296B44200, 0x602646F6B0F2D404 + .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: # decryption sbox output *E*u, *E*t + .quad 0x46F2929626D4D000, 0x2242600464B4F6B0 + .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 +.Lk_dsbo: # decryption sbox final output + .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D + .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.asciz "Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford University)" +.align 64 +.size _vpaes_consts,.-_vpaes_consts +___ + +if ($win64) { +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + lea 16(%rax),%rsi # %xmm save area + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) + .long 0xa548f3fc # cld; rep movsq + lea 0xb8(%rax),%rax # adjust stack pointer + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$`1232/8`,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_${PREFIX}_set_encrypt_key + .rva .LSEH_end_${PREFIX}_set_encrypt_key + .rva .LSEH_info_${PREFIX}_set_encrypt_key + + .rva .LSEH_begin_${PREFIX}_set_decrypt_key + .rva .LSEH_end_${PREFIX}_set_decrypt_key + .rva .LSEH_info_${PREFIX}_set_decrypt_key + + .rva .LSEH_begin_${PREFIX}_encrypt + .rva .LSEH_end_${PREFIX}_encrypt + .rva .LSEH_info_${PREFIX}_encrypt + + .rva .LSEH_begin_${PREFIX}_decrypt + .rva .LSEH_end_${PREFIX}_decrypt + .rva .LSEH_info_${PREFIX}_decrypt + + .rva .LSEH_begin_${PREFIX}_cbc_encrypt + .rva .LSEH_end_${PREFIX}_cbc_encrypt + .rva .LSEH_info_${PREFIX}_cbc_encrypt + +.section .xdata +.align 8 +.LSEH_info_${PREFIX}_set_encrypt_key: + .byte 9,0,0,0 + .rva se_handler + .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[] +.LSEH_info_${PREFIX}_set_decrypt_key: + .byte 9,0,0,0 + .rva se_handler + .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[] +.LSEH_info_${PREFIX}_encrypt: + .byte 9,0,0,0 + .rva se_handler + .rva .Lenc_body,.Lenc_epilogue # HandlerData[] +.LSEH_info_${PREFIX}_decrypt: + .byte 9,0,0,0 + .rva se_handler + .rva .Ldec_body,.Ldec_epilogue # HandlerData[] +.LSEH_info_${PREFIX}_cbc_encrypt: + .byte 9,0,0,0 + .rva se_handler + .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[] +___ +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff --git a/src/lib/libcrypto/arm_arch.h b/src/lib/libcrypto/arm_arch.h new file mode 100644 index 0000000000..5a83107680 --- /dev/null +++ b/src/lib/libcrypto/arm_arch.h @@ -0,0 +1,51 @@ +#ifndef __ARM_ARCH_H__ +#define __ARM_ARCH_H__ + +#if !defined(__ARM_ARCH__) +# if defined(__CC_ARM) +# define __ARM_ARCH__ __TARGET_ARCH_ARM +# if defined(__BIG_ENDIAN) +# define __ARMEB__ +# else +# define __ARMEL__ +# endif +# elif defined(__GNUC__) + /* + * Why doesn't gcc define __ARM_ARCH__? Instead it defines + * bunch of below macros. See all_architectires[] table in + * gcc/config/arm/arm.c. On a side note it defines + * __ARMEL__/__ARMEB__ for little-/big-endian. + */ +# if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7R__)|| defined(__ARM_ARCH_7M__) || \ + defined(__ARM_ARCH_7EM__) +# define __ARM_ARCH__ 7 +# elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6K__)|| defined(__ARM_ARCH_6M__) || \ + defined(__ARM_ARCH_6Z__)|| defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_6T2__) +# define __ARM_ARCH__ 6 +# elif defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) || \ + defined(__ARM_ARCH_5E__)|| defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_5TEJ__) +# define __ARM_ARCH__ 5 +# elif defined(__ARM_ARCH_4__) || defined(__ARM_ARCH_4T__) +# define __ARM_ARCH__ 4 +# else +# error "unsupported ARM architecture" +# endif +# endif +#endif + +#ifdef OPENSSL_FIPSCANISTER +#include +#endif + +#if !__ASSEMBLER__ +extern unsigned int OPENSSL_armcap_P; + +#define ARMV7_NEON (1<<0) +#define ARMV7_TICK (1<<1) +#endif + +#endif diff --git a/src/lib/libcrypto/armcap.c b/src/lib/libcrypto/armcap.c new file mode 100644 index 0000000000..5258d2fbdd --- /dev/null +++ b/src/lib/libcrypto/armcap.c @@ -0,0 +1,80 @@ +#include +#include +#include +#include +#include +#include + +#include "arm_arch.h" + +unsigned int OPENSSL_armcap_P; + +static sigset_t all_masked; + +static sigjmp_buf ill_jmp; +static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } + +/* + * Following subroutines could have been inlined, but it's not all + * ARM compilers support inline assembler... + */ +void _armv7_neon_probe(void); +unsigned int _armv7_tick(void); + +unsigned int OPENSSL_rdtsc(void) + { + if (OPENSSL_armcap_P|ARMV7_TICK) + return _armv7_tick(); + else + return 0; + } + +#if defined(__GNUC__) && __GNUC__>=2 +void OPENSSL_cpuid_setup(void) __attribute__((constructor)); +#endif +void OPENSSL_cpuid_setup(void) + { + char *e; + struct sigaction ill_oact,ill_act; + sigset_t oset; + static int trigger=0; + + if (trigger) return; + trigger=1; + + if ((e=getenv("OPENSSL_armcap"))) + { + OPENSSL_armcap_P=strtoul(e,NULL,0); + return; + } + + sigfillset(&all_masked); + sigdelset(&all_masked,SIGILL); + sigdelset(&all_masked,SIGTRAP); + sigdelset(&all_masked,SIGFPE); + sigdelset(&all_masked,SIGBUS); + sigdelset(&all_masked,SIGSEGV); + + OPENSSL_armcap_P = 0; + + memset(&ill_act,0,sizeof(ill_act)); + ill_act.sa_handler = ill_handler; + ill_act.sa_mask = all_masked; + + sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset); + sigaction(SIGILL,&ill_act,&ill_oact); + + if (sigsetjmp(ill_jmp,1) == 0) + { + _armv7_neon_probe(); + OPENSSL_armcap_P |= ARMV7_NEON; + } + if (sigsetjmp(ill_jmp,1) == 0) + { + _armv7_tick(); + OPENSSL_armcap_P |= ARMV7_TICK; + } + + sigaction (SIGILL,&ill_oact,NULL); + sigprocmask(SIG_SETMASK,&oset,NULL); + } diff --git a/src/lib/libcrypto/armv4cpuid.S b/src/lib/libcrypto/armv4cpuid.S new file mode 100644 index 0000000000..2d618deaa4 --- /dev/null +++ b/src/lib/libcrypto/armv4cpuid.S @@ -0,0 +1,154 @@ +#include "arm_arch.h" + +.text +.code 32 + +.align 5 +.global _armv7_neon_probe +.type _armv7_neon_probe,%function +_armv7_neon_probe: + .word 0xf26ee1fe @ vorr q15,q15,q15 + .word 0xe12fff1e @ bx lr +.size _armv7_neon_probe,.-_armv7_neon_probe + +.global _armv7_tick +.type _armv7_tick,%function +_armv7_tick: + mrc p15,0,r0,c9,c13,0 + .word 0xe12fff1e @ bx lr +.size _armv7_tick,.-_armv7_tick + +.global OPENSSL_atomic_add +.type OPENSSL_atomic_add,%function +OPENSSL_atomic_add: +#if __ARM_ARCH__>=6 +.Ladd: ldrex r2,[r0] + add r3,r2,r1 + strex r2,r3,[r0] + cmp r2,#0 + bne .Ladd + mov r0,r3 + .word 0xe12fff1e @ bx lr +#else + stmdb sp!,{r4-r6,lr} + ldr r2,.Lspinlock + adr r3,.Lspinlock + mov r4,r0 + mov r5,r1 + add r6,r3,r2 @ &spinlock + b .+8 +.Lspin: bl sched_yield + mov r0,#-1 + swp r0,r0,[r6] + cmp r0,#0 + bne .Lspin + + ldr r2,[r4] + add r2,r2,r5 + str r2,[r4] + str r0,[r6] @ release spinlock + ldmia sp!,{r4-r6,lr} + tst lr,#1 + moveq pc,lr + .word 0xe12fff1e @ bx lr +#endif +.size OPENSSL_atomic_add,.-OPENSSL_atomic_add + +.global OPENSSL_cleanse +.type OPENSSL_cleanse,%function +OPENSSL_cleanse: + eor ip,ip,ip + cmp r1,#7 + subhs r1,r1,#4 + bhs .Lot + cmp r1,#0 + beq .Lcleanse_done +.Little: + strb ip,[r0],#1 + subs r1,r1,#1 + bhi .Little + b .Lcleanse_done + +.Lot: tst r0,#3 + beq .Laligned + strb ip,[r0],#1 + sub r1,r1,#1 + b .Lot +.Laligned: + str ip,[r0],#4 + subs r1,r1,#4 + bhs .Laligned + adds r1,r1,#4 + bne .Little +.Lcleanse_done: + tst lr,#1 + moveq pc,lr + .word 0xe12fff1e @ bx lr +.size OPENSSL_cleanse,.-OPENSSL_cleanse + +.global OPENSSL_wipe_cpu +.type OPENSSL_wipe_cpu,%function +OPENSSL_wipe_cpu: + ldr r0,.LOPENSSL_armcap + adr r1,.LOPENSSL_armcap + ldr r0,[r1,r0] + eor r2,r2,r2 + eor r3,r3,r3 + eor ip,ip,ip + tst r0,#1 + beq .Lwipe_done + .word 0xf3000150 @ veor q0, q0, q0 + .word 0xf3022152 @ veor q1, q1, q1 + .word 0xf3044154 @ veor q2, q2, q2 + .word 0xf3066156 @ veor q3, q3, q3 + .word 0xf34001f0 @ veor q8, q8, q8 + .word 0xf34221f2 @ veor q9, q9, q9 + .word 0xf34441f4 @ veor q10, q10, q10 + .word 0xf34661f6 @ veor q11, q11, q11 + .word 0xf34881f8 @ veor q12, q12, q12 + .word 0xf34aa1fa @ veor q13, q13, q13 + .word 0xf34cc1fc @ veor q14, q14, q14 + .word 0xf34ee1fe @ veor q15, q15, q15 +.Lwipe_done: + mov r0,sp + tst lr,#1 + moveq pc,lr + .word 0xe12fff1e @ bx lr +.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu + +.global OPENSSL_instrument_bus +.type OPENSSL_instrument_bus,%function +OPENSSL_instrument_bus: + eor r0,r0,r0 + tst lr,#1 + moveq pc,lr + .word 0xe12fff1e @ bx lr +.size OPENSSL_instrument_bus,.-OPENSSL_instrument_bus + +.global OPENSSL_instrument_bus2 +.type OPENSSL_instrument_bus2,%function +OPENSSL_instrument_bus2: + eor r0,r0,r0 + tst lr,#1 + moveq pc,lr + .word 0xe12fff1e @ bx lr +.size OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2 + +.align 5 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.LOPENSSL_armcap +#if __ARM_ARCH__>=6 +.align 5 +#else +.Lspinlock: +.word atomic_add_spinlock-.Lspinlock +.align 5 + +.data +.align 2 +atomic_add_spinlock: +.word 0 +#endif + +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P diff --git a/src/lib/libcrypto/asn1/a_d2i_fp.c b/src/lib/libcrypto/asn1/a_d2i_fp.c index ece40bc4c0..52b2ebdb63 100644 --- a/src/lib/libcrypto/asn1/a_d2i_fp.c +++ b/src/lib/libcrypto/asn1/a_d2i_fp.c @@ -57,6 +57,7 @@ */ #include +#include #include "cryptlib.h" #include #include @@ -143,17 +144,11 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb) BUF_MEM *b; unsigned char *p; int i; - int ret=-1; ASN1_const_CTX c; - int want=HEADER_SIZE; + size_t want=HEADER_SIZE; int eos=0; -#if defined(__GNUC__) && defined(__ia64) - /* pathetic compiler bug in all known versions as of Nov. 2002 */ - long off=0; -#else - int off=0; -#endif - int len=0; + size_t off=0; + size_t len=0; b=BUF_MEM_new(); if (b == NULL) @@ -169,7 +164,7 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb) { want-=(len-off); - if (!BUF_MEM_grow_clean(b,len+want)) + if (len + want < len || !BUF_MEM_grow_clean(b,len+want)) { ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ERR_R_MALLOC_FAILURE); goto err; @@ -181,7 +176,14 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb) goto err; } if (i > 0) + { + if (len+i < len) + { + ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG); + goto err; + } len+=i; + } } /* else data already loaded */ @@ -206,6 +208,11 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb) { /* no data body so go round again */ eos++; + if (eos < 0) + { + ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_HEADER_TOO_LONG); + goto err; + } want=HEADER_SIZE; } else if (eos && (c.slen == 0) && (c.tag == V_ASN1_EOC)) @@ -220,10 +227,16 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb) else { /* suck in c.slen bytes of data */ - want=(int)c.slen; + want=c.slen; if (want > (len-off)) { want-=(len-off); + if (want > INT_MAX /* BIO_read takes an int length */ || + len+want < len) + { + ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG); + goto err; + } if (!BUF_MEM_grow_clean(b,len+want)) { ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ERR_R_MALLOC_FAILURE); @@ -238,11 +251,18 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb) ASN1_R_NOT_ENOUGH_DATA); goto err; } + /* This can't overflow because + * |len+want| didn't overflow. */ len+=i; - want -= i; + want-=i; } } - off+=(int)c.slen; + if (off + c.slen < off) + { + ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG); + goto err; + } + off+=c.slen; if (eos <= 0) { break; @@ -252,9 +272,15 @@ static int asn1_d2i_read_bio(BIO *in, BUF_MEM **pb) } } + if (off > INT_MAX) + { + ASN1err(ASN1_F_ASN1_D2I_READ_BIO,ASN1_R_TOO_LONG); + goto err; + } + *pb = b; return off; err: if (b != NULL) BUF_MEM_free(b); - return(ret); + return -1; } diff --git a/src/lib/libcrypto/asn1/a_digest.c b/src/lib/libcrypto/asn1/a_digest.c index d00d9e22b1..cbdeea6ac0 100644 --- a/src/lib/libcrypto/asn1/a_digest.c +++ b/src/lib/libcrypto/asn1/a_digest.c @@ -87,7 +87,8 @@ int ASN1_digest(i2d_of_void *i2d, const EVP_MD *type, char *data, p=str; i2d(data,&p); - EVP_Digest(str, i, md, len, type, NULL); + if (!EVP_Digest(str, i, md, len, type, NULL)) + return 0; OPENSSL_free(str); return(1); } @@ -104,7 +105,8 @@ int ASN1_item_digest(const ASN1_ITEM *it, const EVP_MD *type, void *asn, i=ASN1_item_i2d(asn,&str, it); if (!str) return(0); - EVP_Digest(str, i, md, len, type, NULL); + if (!EVP_Digest(str, i, md, len, type, NULL)) + return 0; OPENSSL_free(str); return(1); } diff --git a/src/lib/libcrypto/asn1/a_int.c b/src/lib/libcrypto/asn1/a_int.c index 3348b8762c..ad0d2506f6 100644 --- a/src/lib/libcrypto/asn1/a_int.c +++ b/src/lib/libcrypto/asn1/a_int.c @@ -386,8 +386,8 @@ long ASN1_INTEGER_get(const ASN1_INTEGER *a) if (a->length > (int)sizeof(long)) { - /* hmm... a bit ugly */ - return(0xffffffffL); + /* hmm... a bit ugly, return all ones */ + return -1; } if (a->data == NULL) return 0; diff --git a/src/lib/libcrypto/asn1/a_sign.c b/src/lib/libcrypto/asn1/a_sign.c index ff63bfc7be..7b4a193d6b 100644 --- a/src/lib/libcrypto/asn1/a_sign.c +++ b/src/lib/libcrypto/asn1/a_sign.c @@ -184,9 +184,9 @@ int ASN1_sign(i2d_of_void *i2d, X509_ALGOR *algor1, X509_ALGOR *algor2, p=buf_in; i2d(data,&p); - EVP_SignInit_ex(&ctx,type, NULL); - EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl); - if (!EVP_SignFinal(&ctx,(unsigned char *)buf_out, + if (!EVP_SignInit_ex(&ctx,type, NULL) + || !EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl) + || !EVP_SignFinal(&ctx,(unsigned char *)buf_out, (unsigned int *)&outl,pkey)) { outl=0; @@ -218,65 +218,100 @@ int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2, const EVP_MD *type) { EVP_MD_CTX ctx; + EVP_MD_CTX_init(&ctx); + if (!EVP_DigestSignInit(&ctx, NULL, type, NULL, pkey)) + { + EVP_MD_CTX_cleanup(&ctx); + return 0; + } + return ASN1_item_sign_ctx(it, algor1, algor2, signature, asn, &ctx); + } + + +int ASN1_item_sign_ctx(const ASN1_ITEM *it, + X509_ALGOR *algor1, X509_ALGOR *algor2, + ASN1_BIT_STRING *signature, void *asn, EVP_MD_CTX *ctx) + { + const EVP_MD *type; + EVP_PKEY *pkey; unsigned char *buf_in=NULL,*buf_out=NULL; - int inl=0,outl=0,outll=0; + size_t inl=0,outl=0,outll=0; int signid, paramtype; + int rv; + + type = EVP_MD_CTX_md(ctx); + pkey = EVP_PKEY_CTX_get0_pkey(ctx->pctx); - if (type == NULL) + if (!type || !pkey) { - int def_nid; - if (EVP_PKEY_get_default_digest_nid(pkey, &def_nid) > 0) - type = EVP_get_digestbynid(def_nid); + ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ASN1_R_CONTEXT_NOT_INITIALISED); + return 0; } - if (type == NULL) + if (pkey->ameth->item_sign) { - ASN1err(ASN1_F_ASN1_ITEM_SIGN, ASN1_R_NO_DEFAULT_DIGEST); - return 0; + rv = pkey->ameth->item_sign(ctx, it, asn, algor1, algor2, + signature); + if (rv == 1) + outl = signature->length; + /* Return value meanings: + * <=0: error. + * 1: method does everything. + * 2: carry on as normal. + * 3: ASN1 method sets algorithm identifiers: just sign. + */ + if (rv <= 0) + ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, ERR_R_EVP_LIB); + if (rv <= 1) + goto err; } + else + rv = 2; - if (type->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE) + if (rv == 2) { - if (!pkey->ameth || - !OBJ_find_sigid_by_algs(&signid, EVP_MD_nid(type), - pkey->ameth->pkey_id)) + if (type->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE) { - ASN1err(ASN1_F_ASN1_ITEM_SIGN, - ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED); - return 0; + if (!pkey->ameth || + !OBJ_find_sigid_by_algs(&signid, + EVP_MD_nid(type), + pkey->ameth->pkey_id)) + { + ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX, + ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED); + return 0; + } } - } - else - signid = type->pkey_type; + else + signid = type->pkey_type; - if (pkey->ameth->pkey_flags & ASN1_PKEY_SIGPARAM_NULL) - paramtype = V_ASN1_NULL; - else - paramtype = V_ASN1_UNDEF; + if (pkey->ameth->pkey_flags & ASN1_PKEY_SIGPARAM_NULL) + paramtype = V_ASN1_NULL; + else + paramtype = V_ASN1_UNDEF; - if (algor1) - X509_ALGOR_set0(algor1, OBJ_nid2obj(signid), paramtype, NULL); - if (algor2) - X509_ALGOR_set0(algor2, OBJ_nid2obj(signid), paramtype, NULL); + if (algor1) + X509_ALGOR_set0(algor1, OBJ_nid2obj(signid), paramtype, NULL); + if (algor2) + X509_ALGOR_set0(algor2, OBJ_nid2obj(signid), paramtype, NULL); + + } - EVP_MD_CTX_init(&ctx); inl=ASN1_item_i2d(asn,&buf_in, it); outll=outl=EVP_PKEY_size(pkey); - buf_out=(unsigned char *)OPENSSL_malloc((unsigned int)outl); + buf_out=OPENSSL_malloc((unsigned int)outl); if ((buf_in == NULL) || (buf_out == NULL)) { outl=0; - ASN1err(ASN1_F_ASN1_ITEM_SIGN,ERR_R_MALLOC_FAILURE); + ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,ERR_R_MALLOC_FAILURE); goto err; } - EVP_SignInit_ex(&ctx,type, NULL); - EVP_SignUpdate(&ctx,(unsigned char *)buf_in,inl); - if (!EVP_SignFinal(&ctx,(unsigned char *)buf_out, - (unsigned int *)&outl,pkey)) + if (!EVP_DigestSignUpdate(ctx, buf_in, inl) + || !EVP_DigestSignFinal(ctx, buf_out, &outl)) { outl=0; - ASN1err(ASN1_F_ASN1_ITEM_SIGN,ERR_R_EVP_LIB); + ASN1err(ASN1_F_ASN1_ITEM_SIGN_CTX,ERR_R_EVP_LIB); goto err; } if (signature->data != NULL) OPENSSL_free(signature->data); @@ -289,7 +324,7 @@ int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2, signature->flags&= ~(ASN1_STRING_FLAG_BITS_LEFT|0x07); signature->flags|=ASN1_STRING_FLAG_BITS_LEFT; err: - EVP_MD_CTX_cleanup(&ctx); + EVP_MD_CTX_cleanup(ctx); if (buf_in != NULL) { OPENSSL_cleanse((char *)buf_in,(unsigned int)inl); OPENSSL_free(buf_in); } if (buf_out != NULL) diff --git a/src/lib/libcrypto/asn1/a_verify.c b/src/lib/libcrypto/asn1/a_verify.c index cecdb13c70..432722e409 100644 --- a/src/lib/libcrypto/asn1/a_verify.c +++ b/src/lib/libcrypto/asn1/a_verify.c @@ -101,8 +101,13 @@ int ASN1_verify(i2d_of_void *i2d, X509_ALGOR *a, ASN1_BIT_STRING *signature, p=buf_in; i2d(data,&p); - EVP_VerifyInit_ex(&ctx,type, NULL); - EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl); + if (!EVP_VerifyInit_ex(&ctx,type, NULL) + || !EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl)) + { + ASN1err(ASN1_F_ASN1_VERIFY,ERR_R_EVP_LIB); + ret=0; + goto err; + } OPENSSL_cleanse(buf_in,(unsigned int)inl); OPENSSL_free(buf_in); @@ -126,11 +131,10 @@ err: #endif -int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signature, - void *asn, EVP_PKEY *pkey) +int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, + ASN1_BIT_STRING *signature, void *asn, EVP_PKEY *pkey) { EVP_MD_CTX ctx; - const EVP_MD *type = NULL; unsigned char *buf_in=NULL; int ret= -1,inl; @@ -144,25 +148,47 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signat ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM); goto err; } - type=EVP_get_digestbynid(mdnid); - if (type == NULL) + if (mdnid == NID_undef) { - ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM); - goto err; + if (!pkey->ameth || !pkey->ameth->item_verify) + { + ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM); + goto err; + } + ret = pkey->ameth->item_verify(&ctx, it, asn, a, + signature, pkey); + /* Return value of 2 means carry on, anything else means we + * exit straight away: either a fatal error of the underlying + * verification routine handles all verification. + */ + if (ret != 2) + goto err; + ret = -1; } - - /* Check public key OID matches public key type */ - if (EVP_PKEY_type(pknid) != pkey->ameth->pkey_id) + else { - ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_WRONG_PUBLIC_KEY_TYPE); - goto err; - } + const EVP_MD *type; + type=EVP_get_digestbynid(mdnid); + if (type == NULL) + { + ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM); + goto err; + } + + /* Check public key OID matches public key type */ + if (EVP_PKEY_type(pknid) != pkey->ameth->pkey_id) + { + ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ASN1_R_WRONG_PUBLIC_KEY_TYPE); + goto err; + } + + if (!EVP_DigestVerifyInit(&ctx, NULL, type, NULL, pkey)) + { + ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB); + ret=0; + goto err; + } - if (!EVP_VerifyInit_ex(&ctx,type, NULL)) - { - ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB); - ret=0; - goto err; } inl = ASN1_item_i2d(asn, &buf_in, it); @@ -173,13 +199,18 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *a, ASN1_BIT_STRING *signat goto err; } - EVP_VerifyUpdate(&ctx,(unsigned char *)buf_in,inl); + if (!EVP_DigestVerifyUpdate(&ctx,buf_in,inl)) + { + ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB); + ret=0; + goto err; + } OPENSSL_cleanse(buf_in,(unsigned int)inl); OPENSSL_free(buf_in); - if (EVP_VerifyFinal(&ctx,(unsigned char *)signature->data, - (unsigned int)signature->length,pkey) <= 0) + if (EVP_DigestVerifyFinal(&ctx,signature->data, + (size_t)signature->length) <= 0) { ASN1err(ASN1_F_ASN1_ITEM_VERIFY,ERR_R_EVP_LIB); ret=0; diff --git a/src/lib/libcrypto/asn1/ameth_lib.c b/src/lib/libcrypto/asn1/ameth_lib.c index 5a581b90ea..a19e058fca 100644 --- a/src/lib/libcrypto/asn1/ameth_lib.c +++ b/src/lib/libcrypto/asn1/ameth_lib.c @@ -69,6 +69,7 @@ extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[]; extern const EVP_PKEY_ASN1_METHOD dh_asn1_meth; extern const EVP_PKEY_ASN1_METHOD eckey_asn1_meth; extern const EVP_PKEY_ASN1_METHOD hmac_asn1_meth; +extern const EVP_PKEY_ASN1_METHOD cmac_asn1_meth; /* Keep this sorted in type order !! */ static const EVP_PKEY_ASN1_METHOD *standard_methods[] = @@ -90,7 +91,8 @@ static const EVP_PKEY_ASN1_METHOD *standard_methods[] = #ifndef OPENSSL_NO_EC &eckey_asn1_meth, #endif - &hmac_asn1_meth + &hmac_asn1_meth, + &cmac_asn1_meth }; typedef int sk_cmp_fn_type(const char * const *a, const char * const *b); @@ -291,6 +293,8 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags, if (!ameth) return NULL; + memset(ameth, 0, sizeof(EVP_PKEY_ASN1_METHOD)); + ameth->pkey_id = id; ameth->pkey_base_id = id; ameth->pkey_flags = flags | ASN1_PKEY_DYNAMIC; @@ -325,6 +329,9 @@ EVP_PKEY_ASN1_METHOD* EVP_PKEY_asn1_new(int id, int flags, ameth->old_priv_encode = 0; ameth->old_priv_decode = 0; + ameth->item_verify = 0; + ameth->item_sign = 0; + ameth->pkey_size = 0; ameth->pkey_bits = 0; @@ -376,6 +383,9 @@ void EVP_PKEY_asn1_copy(EVP_PKEY_ASN1_METHOD *dst, dst->pkey_free = src->pkey_free; dst->pkey_ctrl = src->pkey_ctrl; + dst->item_sign = src->item_sign; + dst->item_verify = src->item_verify; + } void EVP_PKEY_asn1_free(EVP_PKEY_ASN1_METHOD *ameth) diff --git a/src/lib/libcrypto/asn1/asn1.h b/src/lib/libcrypto/asn1/asn1.h index 59540e4e79..220a0c8c63 100644 --- a/src/lib/libcrypto/asn1/asn1.h +++ b/src/lib/libcrypto/asn1/asn1.h @@ -235,7 +235,7 @@ typedef struct asn1_object_st */ #define ASN1_STRING_FLAG_MSTRING 0x040 /* This is the base type that holds just about everything :-) */ -typedef struct asn1_string_st +struct asn1_string_st { int length; int type; @@ -245,7 +245,7 @@ typedef struct asn1_string_st * input data has a non-zero 'unused bits' value, it will be * handled correctly */ long flags; - } ASN1_STRING; + }; /* ASN1_ENCODING structure: this is used to save the received * encoding of an ASN1 type. This is useful to get round @@ -293,7 +293,6 @@ DECLARE_STACK_OF(ASN1_STRING_TABLE) * see asn1t.h */ typedef struct ASN1_TEMPLATE_st ASN1_TEMPLATE; -typedef struct ASN1_ITEM_st ASN1_ITEM; typedef struct ASN1_TLC_st ASN1_TLC; /* This is just an opaque pointer */ typedef struct ASN1_VALUE_st ASN1_VALUE; @@ -1194,6 +1193,7 @@ void ERR_load_ASN1_strings(void); #define ASN1_F_ASN1_ITEM_I2D_FP 193 #define ASN1_F_ASN1_ITEM_PACK 198 #define ASN1_F_ASN1_ITEM_SIGN 195 +#define ASN1_F_ASN1_ITEM_SIGN_CTX 220 #define ASN1_F_ASN1_ITEM_UNPACK 199 #define ASN1_F_ASN1_ITEM_VERIFY 197 #define ASN1_F_ASN1_MBSTRING_NCOPY 122 @@ -1266,6 +1266,7 @@ void ERR_load_ASN1_strings(void); #define ASN1_F_PKCS5_PBE2_SET_IV 167 #define ASN1_F_PKCS5_PBE_SET 202 #define ASN1_F_PKCS5_PBE_SET0_ALGOR 215 +#define ASN1_F_PKCS5_PBKDF2_SET 219 #define ASN1_F_SMIME_READ_ASN1 212 #define ASN1_F_SMIME_TEXT 213 #define ASN1_F_X509_CINF_NEW 168 @@ -1291,6 +1292,7 @@ void ERR_load_ASN1_strings(void); #define ASN1_R_BOOLEAN_IS_WRONG_LENGTH 106 #define ASN1_R_BUFFER_TOO_SMALL 107 #define ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER 108 +#define ASN1_R_CONTEXT_NOT_INITIALISED 217 #define ASN1_R_DATA_IS_WRONG 109 #define ASN1_R_DECODE_ERROR 110 #define ASN1_R_DECODING_ERROR 111 diff --git a/src/lib/libcrypto/asn1/asn1_err.c b/src/lib/libcrypto/asn1/asn1_err.c index 6e04d08f31..1a30bf119b 100644 --- a/src/lib/libcrypto/asn1/asn1_err.c +++ b/src/lib/libcrypto/asn1/asn1_err.c @@ -1,6 +1,6 @@ /* crypto/asn1/asn1_err.c */ /* ==================================================================== - * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -107,6 +107,7 @@ static ERR_STRING_DATA ASN1_str_functs[]= {ERR_FUNC(ASN1_F_ASN1_ITEM_I2D_FP), "ASN1_item_i2d_fp"}, {ERR_FUNC(ASN1_F_ASN1_ITEM_PACK), "ASN1_item_pack"}, {ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN), "ASN1_item_sign"}, +{ERR_FUNC(ASN1_F_ASN1_ITEM_SIGN_CTX), "ASN1_item_sign_ctx"}, {ERR_FUNC(ASN1_F_ASN1_ITEM_UNPACK), "ASN1_item_unpack"}, {ERR_FUNC(ASN1_F_ASN1_ITEM_VERIFY), "ASN1_item_verify"}, {ERR_FUNC(ASN1_F_ASN1_MBSTRING_NCOPY), "ASN1_mbstring_ncopy"}, @@ -179,6 +180,7 @@ static ERR_STRING_DATA ASN1_str_functs[]= {ERR_FUNC(ASN1_F_PKCS5_PBE2_SET_IV), "PKCS5_pbe2_set_iv"}, {ERR_FUNC(ASN1_F_PKCS5_PBE_SET), "PKCS5_pbe_set"}, {ERR_FUNC(ASN1_F_PKCS5_PBE_SET0_ALGOR), "PKCS5_pbe_set0_algor"}, +{ERR_FUNC(ASN1_F_PKCS5_PBKDF2_SET), "PKCS5_pbkdf2_set"}, {ERR_FUNC(ASN1_F_SMIME_READ_ASN1), "SMIME_read_ASN1"}, {ERR_FUNC(ASN1_F_SMIME_TEXT), "SMIME_text"}, {ERR_FUNC(ASN1_F_X509_CINF_NEW), "X509_CINF_NEW"}, @@ -207,6 +209,7 @@ static ERR_STRING_DATA ASN1_str_reasons[]= {ERR_REASON(ASN1_R_BOOLEAN_IS_WRONG_LENGTH),"boolean is wrong length"}, {ERR_REASON(ASN1_R_BUFFER_TOO_SMALL) ,"buffer too small"}, {ERR_REASON(ASN1_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER),"cipher has no object identifier"}, +{ERR_REASON(ASN1_R_CONTEXT_NOT_INITIALISED),"context not initialised"}, {ERR_REASON(ASN1_R_DATA_IS_WRONG) ,"data is wrong"}, {ERR_REASON(ASN1_R_DECODE_ERROR) ,"decode error"}, {ERR_REASON(ASN1_R_DECODING_ERROR) ,"decoding error"}, diff --git a/src/lib/libcrypto/asn1/asn1_locl.h b/src/lib/libcrypto/asn1/asn1_locl.h index 5aa65e28f5..9fcf0d9530 100644 --- a/src/lib/libcrypto/asn1/asn1_locl.h +++ b/src/lib/libcrypto/asn1/asn1_locl.h @@ -102,6 +102,10 @@ struct evp_pkey_asn1_method_st int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b); int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent, ASN1_PCTX *pctx); + int (*sig_print)(BIO *out, + const X509_ALGOR *sigalg, const ASN1_STRING *sig, + int indent, ASN1_PCTX *pctx); + void (*pkey_free)(EVP_PKEY *pkey); int (*pkey_ctrl)(EVP_PKEY *pkey, int op, long arg1, void *arg2); @@ -111,6 +115,13 @@ struct evp_pkey_asn1_method_st int (*old_priv_decode)(EVP_PKEY *pkey, const unsigned char **pder, int derlen); int (*old_priv_encode)(const EVP_PKEY *pkey, unsigned char **pder); + /* Custom ASN1 signature verification */ + int (*item_verify)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, + X509_ALGOR *a, ASN1_BIT_STRING *sig, + EVP_PKEY *pkey); + int (*item_sign)(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, + X509_ALGOR *alg1, X509_ALGOR *alg2, + ASN1_BIT_STRING *sig); } /* EVP_PKEY_ASN1_METHOD */; diff --git a/src/lib/libcrypto/asn1/asn_mime.c b/src/lib/libcrypto/asn1/asn_mime.c index c1d1b12291..54a704a969 100644 --- a/src/lib/libcrypto/asn1/asn_mime.c +++ b/src/lib/libcrypto/asn1/asn_mime.c @@ -377,8 +377,12 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags, BIO *tmpbio; const ASN1_AUX *aux = it->funcs; ASN1_STREAM_ARG sarg; + int rv = 1; - if (!(flags & SMIME_DETACHED)) + /* If data is not deteched or resigning then the output BIO is + * already set up to finalise when it is written through. + */ + if (!(flags & SMIME_DETACHED) || (flags & PKCS7_REUSE_DIGEST)) { SMIME_crlf_copy(data, out, flags); return 1; @@ -405,7 +409,7 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags, /* Finalize structure */ if (aux->asn1_cb(ASN1_OP_DETACHED_POST, &val, it, &sarg) <= 0) - return 0; + rv = 0; /* Now remove any digests prepended to the BIO */ @@ -416,7 +420,7 @@ static int asn1_output_data(BIO *out, BIO *data, ASN1_VALUE *val, int flags, sarg.ndef_bio = tmpbio; } - return 1; + return rv; } @@ -486,9 +490,9 @@ ASN1_VALUE *SMIME_read_ASN1(BIO *bio, BIO **bcont, const ASN1_ITEM *it) if(strcmp(hdr->value, "application/x-pkcs7-signature") && strcmp(hdr->value, "application/pkcs7-signature")) { - sk_MIME_HEADER_pop_free(headers, mime_hdr_free); ASN1err(ASN1_F_SMIME_READ_ASN1,ASN1_R_SIG_INVALID_MIME_TYPE); ERR_add_error_data(2, "type: ", hdr->value); + sk_MIME_HEADER_pop_free(headers, mime_hdr_free); sk_BIO_pop_free(parts, BIO_vfree); return NULL; } @@ -801,7 +805,7 @@ static MIME_HEADER *mime_hdr_new(char *name, char *value) if(name) { if(!(tmpname = BUF_strdup(name))) return NULL; for(p = tmpname ; *p; p++) { - c = *p; + c = (unsigned char)*p; if(isupper(c)) { c = tolower(c); *p = c; @@ -811,7 +815,7 @@ static MIME_HEADER *mime_hdr_new(char *name, char *value) if(value) { if(!(tmpval = BUF_strdup(value))) return NULL; for(p = tmpval ; *p; p++) { - c = *p; + c = (unsigned char)*p; if(isupper(c)) { c = tolower(c); *p = c; @@ -835,7 +839,7 @@ static int mime_hdr_addparam(MIME_HEADER *mhdr, char *name, char *value) tmpname = BUF_strdup(name); if(!tmpname) return 0; for(p = tmpname ; *p; p++) { - c = *p; + c = (unsigned char)*p; if(isupper(c)) { c = tolower(c); *p = c; @@ -858,12 +862,17 @@ static int mime_hdr_addparam(MIME_HEADER *mhdr, char *name, char *value) static int mime_hdr_cmp(const MIME_HEADER * const *a, const MIME_HEADER * const *b) { + if (!(*a)->name || !(*b)->name) + return !!(*a)->name - !!(*b)->name; + return(strcmp((*a)->name, (*b)->name)); } static int mime_param_cmp(const MIME_PARAM * const *a, const MIME_PARAM * const *b) { + if (!(*a)->param_name || !(*b)->param_name) + return !!(*a)->param_name - !!(*b)->param_name; return(strcmp((*a)->param_name, (*b)->param_name)); } diff --git a/src/lib/libcrypto/asn1/n_pkey.c b/src/lib/libcrypto/asn1/n_pkey.c index e7d0439062..e251739933 100644 --- a/src/lib/libcrypto/asn1/n_pkey.c +++ b/src/lib/libcrypto/asn1/n_pkey.c @@ -129,6 +129,7 @@ int i2d_RSA_NET(const RSA *a, unsigned char **pp, unsigned char buf[256],*zz; unsigned char key[EVP_MAX_KEY_LENGTH]; EVP_CIPHER_CTX ctx; + EVP_CIPHER_CTX_init(&ctx); if (a == NULL) return(0); @@ -206,24 +207,28 @@ int i2d_RSA_NET(const RSA *a, unsigned char **pp, i = strlen((char *)buf); /* If the key is used for SGC the algorithm is modified a little. */ if(sgckey) { - EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL); + if (!EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL)) + goto err; memcpy(buf + 16, "SGCKEYSALT", 10); i = 26; } - EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL); + if (!EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL)) + goto err; OPENSSL_cleanse(buf,256); /* Encrypt private key in place */ zz = enckey->enckey->digest->data; - EVP_CIPHER_CTX_init(&ctx); - EVP_EncryptInit_ex(&ctx,EVP_rc4(),NULL,key,NULL); - EVP_EncryptUpdate(&ctx,zz,&i,zz,pkeylen); - EVP_EncryptFinal_ex(&ctx,zz + i,&j); - EVP_CIPHER_CTX_cleanup(&ctx); + if (!EVP_EncryptInit_ex(&ctx,EVP_rc4(),NULL,key,NULL)) + goto err; + if (!EVP_EncryptUpdate(&ctx,zz,&i,zz,pkeylen)) + goto err; + if (!EVP_EncryptFinal_ex(&ctx,zz + i,&j)) + goto err; ret = i2d_NETSCAPE_ENCRYPTED_PKEY(enckey, pp); err: + EVP_CIPHER_CTX_cleanup(&ctx); NETSCAPE_ENCRYPTED_PKEY_free(enckey); NETSCAPE_PKEY_free(pkey); return(ret); @@ -288,6 +293,7 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os, const unsigned char *zz; unsigned char key[EVP_MAX_KEY_LENGTH]; EVP_CIPHER_CTX ctx; + EVP_CIPHER_CTX_init(&ctx); i=cb((char *)buf,256,"Enter Private Key password:",0); if (i != 0) @@ -298,19 +304,22 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os, i = strlen((char *)buf); if(sgckey){ - EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL); + if (!EVP_Digest(buf, i, buf, NULL, EVP_md5(), NULL)) + goto err; memcpy(buf + 16, "SGCKEYSALT", 10); i = 26; } - EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL); + if (!EVP_BytesToKey(EVP_rc4(),EVP_md5(),NULL,buf,i,1,key,NULL)) + goto err; OPENSSL_cleanse(buf,256); - EVP_CIPHER_CTX_init(&ctx); - EVP_DecryptInit_ex(&ctx,EVP_rc4(),NULL, key,NULL); - EVP_DecryptUpdate(&ctx,os->data,&i,os->data,os->length); - EVP_DecryptFinal_ex(&ctx,&(os->data[i]),&j); - EVP_CIPHER_CTX_cleanup(&ctx); + if (!EVP_DecryptInit_ex(&ctx,EVP_rc4(),NULL, key,NULL)) + goto err; + if (!EVP_DecryptUpdate(&ctx,os->data,&i,os->data,os->length)) + goto err; + if (!EVP_DecryptFinal_ex(&ctx,&(os->data[i]),&j)) + goto err; os->length=i+j; zz=os->data; @@ -328,6 +337,7 @@ static RSA *d2i_RSA_NET_2(RSA **a, ASN1_OCTET_STRING *os, goto err; } err: + EVP_CIPHER_CTX_cleanup(&ctx); NETSCAPE_PKEY_free(pkey); return(ret); } diff --git a/src/lib/libcrypto/asn1/p5_pbev2.c b/src/lib/libcrypto/asn1/p5_pbev2.c index cb49b6651d..4ea683036b 100644 --- a/src/lib/libcrypto/asn1/p5_pbev2.c +++ b/src/lib/libcrypto/asn1/p5_pbev2.c @@ -91,12 +91,10 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter, unsigned char *aiv, int prf_nid) { X509_ALGOR *scheme = NULL, *kalg = NULL, *ret = NULL; - int alg_nid; + int alg_nid, keylen; EVP_CIPHER_CTX ctx; unsigned char iv[EVP_MAX_IV_LENGTH]; - PBKDF2PARAM *kdf = NULL; PBE2PARAM *pbe2 = NULL; - ASN1_OCTET_STRING *osalt = NULL; ASN1_OBJECT *obj; alg_nid = EVP_CIPHER_type(cipher); @@ -127,7 +125,8 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter, EVP_CIPHER_CTX_init(&ctx); /* Dummy cipherinit to just setup the IV, and PRF */ - EVP_CipherInit_ex(&ctx, cipher, NULL, NULL, iv, 0); + if (!EVP_CipherInit_ex(&ctx, cipher, NULL, NULL, iv, 0)) + goto err; if(EVP_CIPHER_param_to_asn1(&ctx, scheme->parameter) < 0) { ASN1err(ASN1_F_PKCS5_PBE2_SET_IV, ASN1_R_ERROR_SETTING_CIPHER_PARAMS); @@ -145,55 +144,21 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter, } EVP_CIPHER_CTX_cleanup(&ctx); - if(!(kdf = PBKDF2PARAM_new())) goto merr; - if(!(osalt = M_ASN1_OCTET_STRING_new())) goto merr; - - if (!saltlen) saltlen = PKCS5_SALT_LEN; - if (!(osalt->data = OPENSSL_malloc (saltlen))) goto merr; - osalt->length = saltlen; - if (salt) memcpy (osalt->data, salt, saltlen); - else if (RAND_pseudo_bytes (osalt->data, saltlen) < 0) goto merr; - - if(iter <= 0) iter = PKCS5_DEFAULT_ITER; - if(!ASN1_INTEGER_set(kdf->iter, iter)) goto merr; - - /* Now include salt in kdf structure */ - kdf->salt->value.octet_string = osalt; - kdf->salt->type = V_ASN1_OCTET_STRING; - osalt = NULL; - /* If its RC2 then we'd better setup the key length */ - if(alg_nid == NID_rc2_cbc) { - if(!(kdf->keylength = M_ASN1_INTEGER_new())) goto merr; - if(!ASN1_INTEGER_set (kdf->keylength, - EVP_CIPHER_key_length(cipher))) goto merr; - } - - /* prf can stay NULL if we are using hmacWithSHA1 */ - if (prf_nid != NID_hmacWithSHA1) - { - kdf->prf = X509_ALGOR_new(); - if (!kdf->prf) - goto merr; - X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid), - V_ASN1_NULL, NULL); - } - - /* Now setup the PBE2PARAM keyfunc structure */ + if(alg_nid == NID_rc2_cbc) + keylen = EVP_CIPHER_key_length(cipher); + else + keylen = -1; - pbe2->keyfunc->algorithm = OBJ_nid2obj(NID_id_pbkdf2); + /* Setup keyfunc */ - /* Encode PBKDF2PARAM into parameter of pbe2 */ + X509_ALGOR_free(pbe2->keyfunc); - if(!(pbe2->keyfunc->parameter = ASN1_TYPE_new())) goto merr; + pbe2->keyfunc = PKCS5_pbkdf2_set(iter, salt, saltlen, prf_nid, keylen); - if(!ASN1_item_pack(kdf, ASN1_ITEM_rptr(PBKDF2PARAM), - &pbe2->keyfunc->parameter->value.sequence)) goto merr; - pbe2->keyfunc->parameter->type = V_ASN1_SEQUENCE; - - PBKDF2PARAM_free(kdf); - kdf = NULL; + if (!pbe2->keyfunc) + goto merr; /* Now set up top level AlgorithmIdentifier */ @@ -219,8 +184,6 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter, err: PBE2PARAM_free(pbe2); /* Note 'scheme' is freed as part of pbe2 */ - M_ASN1_OCTET_STRING_free(osalt); - PBKDF2PARAM_free(kdf); X509_ALGOR_free(kalg); X509_ALGOR_free(ret); @@ -233,3 +196,85 @@ X509_ALGOR *PKCS5_pbe2_set(const EVP_CIPHER *cipher, int iter, { return PKCS5_pbe2_set_iv(cipher, iter, salt, saltlen, NULL, -1); } + +X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen, + int prf_nid, int keylen) + { + X509_ALGOR *keyfunc = NULL; + PBKDF2PARAM *kdf = NULL; + ASN1_OCTET_STRING *osalt = NULL; + + if(!(kdf = PBKDF2PARAM_new())) + goto merr; + if(!(osalt = M_ASN1_OCTET_STRING_new())) + goto merr; + + kdf->salt->value.octet_string = osalt; + kdf->salt->type = V_ASN1_OCTET_STRING; + + if (!saltlen) + saltlen = PKCS5_SALT_LEN; + if (!(osalt->data = OPENSSL_malloc (saltlen))) + goto merr; + + osalt->length = saltlen; + + if (salt) + memcpy (osalt->data, salt, saltlen); + else if (RAND_pseudo_bytes (osalt->data, saltlen) < 0) + goto merr; + + if(iter <= 0) + iter = PKCS5_DEFAULT_ITER; + + if(!ASN1_INTEGER_set(kdf->iter, iter)) + goto merr; + + /* If have a key len set it up */ + + if(keylen > 0) + { + if(!(kdf->keylength = M_ASN1_INTEGER_new())) + goto merr; + if(!ASN1_INTEGER_set (kdf->keylength, keylen)) + goto merr; + } + + /* prf can stay NULL if we are using hmacWithSHA1 */ + if (prf_nid > 0 && prf_nid != NID_hmacWithSHA1) + { + kdf->prf = X509_ALGOR_new(); + if (!kdf->prf) + goto merr; + X509_ALGOR_set0(kdf->prf, OBJ_nid2obj(prf_nid), + V_ASN1_NULL, NULL); + } + + /* Finally setup the keyfunc structure */ + + keyfunc = X509_ALGOR_new(); + if (!keyfunc) + goto merr; + + keyfunc->algorithm = OBJ_nid2obj(NID_id_pbkdf2); + + /* Encode PBKDF2PARAM into parameter of pbe2 */ + + if(!(keyfunc->parameter = ASN1_TYPE_new())) + goto merr; + + if(!ASN1_item_pack(kdf, ASN1_ITEM_rptr(PBKDF2PARAM), + &keyfunc->parameter->value.sequence)) + goto merr; + keyfunc->parameter->type = V_ASN1_SEQUENCE; + + PBKDF2PARAM_free(kdf); + return keyfunc; + + merr: + ASN1err(ASN1_F_PKCS5_PBKDF2_SET,ERR_R_MALLOC_FAILURE); + PBKDF2PARAM_free(kdf); + X509_ALGOR_free(keyfunc); + return NULL; + } + diff --git a/src/lib/libcrypto/asn1/t_crl.c b/src/lib/libcrypto/asn1/t_crl.c index ee5a687ce8..c61169208a 100644 --- a/src/lib/libcrypto/asn1/t_crl.c +++ b/src/lib/libcrypto/asn1/t_crl.c @@ -94,8 +94,7 @@ int X509_CRL_print(BIO *out, X509_CRL *x) l = X509_CRL_get_version(x); BIO_printf(out, "%8sVersion %lu (0x%lx)\n", "", l+1, l); i = OBJ_obj2nid(x->sig_alg->algorithm); - BIO_printf(out, "%8sSignature Algorithm: %s\n", "", - (i == NID_undef) ? "NONE" : OBJ_nid2ln(i)); + X509_signature_print(out, x->sig_alg, NULL); p=X509_NAME_oneline(X509_CRL_get_issuer(x),NULL,0); BIO_printf(out,"%8sIssuer: %s\n","",p); OPENSSL_free(p); diff --git a/src/lib/libcrypto/asn1/t_x509.c b/src/lib/libcrypto/asn1/t_x509.c index e061f2ffad..edbb39a02f 100644 --- a/src/lib/libcrypto/asn1/t_x509.c +++ b/src/lib/libcrypto/asn1/t_x509.c @@ -72,6 +72,7 @@ #include #include #include +#include "asn1_locl.h" #ifndef OPENSSL_NO_FP_API int X509_print_fp(FILE *fp, X509 *x) @@ -137,10 +138,10 @@ int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags, unsigned long cflag) if (BIO_write(bp," Serial Number:",22) <= 0) goto err; bs=X509_get_serialNumber(x); - if (bs->length <= 4) + if (bs->length <= (int)sizeof(long)) { l=ASN1_INTEGER_get(bs); - if (l < 0) + if (bs->type == V_ASN1_NEG_INTEGER) { l= -l; neg="-"; @@ -167,12 +168,16 @@ int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags, unsigned long cflag) if(!(cflag & X509_FLAG_NO_SIGNAME)) { + if(X509_signature_print(bp, x->sig_alg, NULL) <= 0) + goto err; +#if 0 if (BIO_printf(bp,"%8sSignature Algorithm: ","") <= 0) goto err; if (i2a_ASN1_OBJECT(bp, ci->signature->algorithm) <= 0) goto err; if (BIO_puts(bp, "\n") <= 0) goto err; +#endif } if(!(cflag & X509_FLAG_NO_ISSUER)) @@ -255,7 +260,8 @@ int X509_ocspid_print (BIO *bp, X509 *x) goto err; i2d_X509_NAME(x->cert_info->subject, &dertmp); - EVP_Digest(der, derlen, SHA1md, NULL, EVP_sha1(), NULL); + if (!EVP_Digest(der, derlen, SHA1md, NULL, EVP_sha1(), NULL)) + goto err; for (i=0; i < SHA_DIGEST_LENGTH; i++) { if (BIO_printf(bp,"%02X",SHA1md[i]) <= 0) goto err; @@ -268,8 +274,10 @@ int X509_ocspid_print (BIO *bp, X509 *x) if (BIO_printf(bp,"\n Public key OCSP hash: ") <= 0) goto err; - EVP_Digest(x->cert_info->key->public_key->data, - x->cert_info->key->public_key->length, SHA1md, NULL, EVP_sha1(), NULL); + if (!EVP_Digest(x->cert_info->key->public_key->data, + x->cert_info->key->public_key->length, + SHA1md, NULL, EVP_sha1(), NULL)) + goto err; for (i=0; i < SHA_DIGEST_LENGTH; i++) { if (BIO_printf(bp,"%02X",SHA1md[i]) <= 0) @@ -283,23 +291,50 @@ err: return(0); } -int X509_signature_print(BIO *bp, X509_ALGOR *sigalg, ASN1_STRING *sig) +int X509_signature_dump(BIO *bp, const ASN1_STRING *sig, int indent) { - unsigned char *s; + const unsigned char *s; int i, n; - if (BIO_puts(bp," Signature Algorithm: ") <= 0) return 0; - if (i2a_ASN1_OBJECT(bp, sigalg->algorithm) <= 0) return 0; n=sig->length; s=sig->data; for (i=0; ialgorithm) <= 0) return 0; + + sig_nid = OBJ_obj2nid(sigalg->algorithm); + if (sig_nid != NID_undef) + { + int pkey_nid, dig_nid; + const EVP_PKEY_ASN1_METHOD *ameth; + if (OBJ_find_sigid_algs(sig_nid, &dig_nid, &pkey_nid)) + { + ameth = EVP_PKEY_asn1_find(NULL, pkey_nid); + if (ameth && ameth->sig_print) + return ameth->sig_print(bp, sigalg, sig, 9, 0); + } + } + if (sig) + return X509_signature_dump(bp, sig, 9); + else if (BIO_puts(bp, "\n") <= 0) + return 0; return 1; } diff --git a/src/lib/libcrypto/asn1/tasn_prn.c b/src/lib/libcrypto/asn1/tasn_prn.c index 453698012d..542a091a66 100644 --- a/src/lib/libcrypto/asn1/tasn_prn.c +++ b/src/lib/libcrypto/asn1/tasn_prn.c @@ -446,11 +446,11 @@ static int asn1_print_fsname(BIO *out, int indent, return 1; } -static int asn1_print_boolean_ctx(BIO *out, const int bool, +static int asn1_print_boolean_ctx(BIO *out, int boolval, const ASN1_PCTX *pctx) { const char *str; - switch (bool) + switch (boolval) { case -1: str = "BOOL ABSENT"; @@ -574,10 +574,10 @@ static int asn1_primitive_print(BIO *out, ASN1_VALUE **fld, { case V_ASN1_BOOLEAN: { - int bool = *(int *)fld; - if (bool == -1) - bool = it->size; - ret = asn1_print_boolean_ctx(out, bool, pctx); + int boolval = *(int *)fld; + if (boolval == -1) + boolval = it->size; + ret = asn1_print_boolean_ctx(out, boolval, pctx); } break; diff --git a/src/lib/libcrypto/asn1/x_algor.c b/src/lib/libcrypto/asn1/x_algor.c index 99e53429b7..274e456c73 100644 --- a/src/lib/libcrypto/asn1/x_algor.c +++ b/src/lib/libcrypto/asn1/x_algor.c @@ -128,3 +128,17 @@ void X509_ALGOR_get0(ASN1_OBJECT **paobj, int *pptype, void **ppval, } } +/* Set up an X509_ALGOR DigestAlgorithmIdentifier from an EVP_MD */ + +void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md) + { + int param_type; + + if (md->flags & EVP_MD_FLAG_DIGALGID_ABSENT) + param_type = V_ASN1_UNDEF; + else + param_type = V_ASN1_NULL; + + X509_ALGOR_set0(alg, OBJ_nid2obj(EVP_MD_type(md)), param_type, NULL); + + } diff --git a/src/lib/libcrypto/asn1/x_name.c b/src/lib/libcrypto/asn1/x_name.c index 49be08b4da..d7c2318693 100644 --- a/src/lib/libcrypto/asn1/x_name.c +++ b/src/lib/libcrypto/asn1/x_name.c @@ -399,8 +399,7 @@ static int asn1_string_canon(ASN1_STRING *out, ASN1_STRING *in) /* If type not in bitmask just copy string across */ if (!(ASN1_tag2bit(in->type) & ASN1_MASK_CANON)) { - out->type = in->type; - if (!ASN1_STRING_set(out, in->data, in->length)) + if (!ASN1_STRING_copy(out, in)) return 0; return 1; } diff --git a/src/lib/libcrypto/asn1/x_pubkey.c b/src/lib/libcrypto/asn1/x_pubkey.c index d42b6a2c54..627ec87f9f 100644 --- a/src/lib/libcrypto/asn1/x_pubkey.c +++ b/src/lib/libcrypto/asn1/x_pubkey.c @@ -171,7 +171,16 @@ EVP_PKEY *X509_PUBKEY_get(X509_PUBKEY *key) goto error; } - key->pkey = ret; + /* Check to see if another thread set key->pkey first */ + CRYPTO_w_lock(CRYPTO_LOCK_EVP_PKEY); + if (key->pkey) + { + EVP_PKEY_free(ret); + ret = key->pkey; + } + else + key->pkey = ret; + CRYPTO_w_unlock(CRYPTO_LOCK_EVP_PKEY); CRYPTO_add(&ret->references, 1, CRYPTO_LOCK_EVP_PKEY); return ret; diff --git a/src/lib/libcrypto/bf/bf_skey.c b/src/lib/libcrypto/bf/bf_skey.c index 3673cdee6e..3b0bca41ae 100644 --- a/src/lib/libcrypto/bf/bf_skey.c +++ b/src/lib/libcrypto/bf/bf_skey.c @@ -58,11 +58,19 @@ #include #include +#include #include #include "bf_locl.h" #include "bf_pi.h" void BF_set_key(BF_KEY *key, int len, const unsigned char *data) +#ifdef OPENSSL_FIPS + { + fips_cipher_abort(BLOWFISH); + private_BF_set_key(key, len, data); + } +void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data) +#endif { int i; BF_LONG *p,ri,in[2]; diff --git a/src/lib/libcrypto/bf/blowfish.h b/src/lib/libcrypto/bf/blowfish.h index b97e76f9a3..4b6c8920a4 100644 --- a/src/lib/libcrypto/bf/blowfish.h +++ b/src/lib/libcrypto/bf/blowfish.h @@ -104,7 +104,9 @@ typedef struct bf_key_st BF_LONG S[4*256]; } BF_KEY; - +#ifdef OPENSSL_FIPS +void private_BF_set_key(BF_KEY *key, int len, const unsigned char *data); +#endif void BF_set_key(BF_KEY *key, int len, const unsigned char *data); void BF_encrypt(BF_LONG *data,const BF_KEY *key); diff --git a/src/lib/libcrypto/bio/b_sock.c b/src/lib/libcrypto/bio/b_sock.c index d47310d650..41f958be71 100644 --- a/src/lib/libcrypto/bio/b_sock.c +++ b/src/lib/libcrypto/bio/b_sock.c @@ -960,7 +960,6 @@ int BIO_set_tcp_ndelay(int s, int on) #endif return(ret == 0); } -#endif int BIO_socket_nbio(int s, int mode) { @@ -973,3 +972,4 @@ int BIO_socket_nbio(int s, int mode) #endif return(ret == 0); } +#endif diff --git a/src/lib/libcrypto/bio/bio.h b/src/lib/libcrypto/bio/bio.h index ab47abcf14..05699ab212 100644 --- a/src/lib/libcrypto/bio/bio.h +++ b/src/lib/libcrypto/bio/bio.h @@ -68,6 +68,14 @@ #include +#ifndef OPENSSL_NO_SCTP +# ifndef OPENSSL_SYS_VMS +# include +# else +# include +# endif +#endif + #ifdef __cplusplus extern "C" { #endif @@ -95,6 +103,9 @@ extern "C" { #define BIO_TYPE_BIO (19|0x0400) /* (half a) BIO pair */ #define BIO_TYPE_LINEBUFFER (20|0x0200) /* filter */ #define BIO_TYPE_DGRAM (21|0x0400|0x0100) +#ifndef OPENSSL_NO_SCTP +#define BIO_TYPE_DGRAM_SCTP (24|0x0400|0x0100) +#endif #define BIO_TYPE_ASN1 (22|0x0200) /* filter */ #define BIO_TYPE_COMP (23|0x0200) /* filter */ @@ -146,6 +157,7 @@ extern "C" { /* #endif */ #define BIO_CTRL_DGRAM_QUERY_MTU 40 /* as kernel for current MTU */ +#define BIO_CTRL_DGRAM_GET_FALLBACK_MTU 47 #define BIO_CTRL_DGRAM_GET_MTU 41 /* get cached value for MTU */ #define BIO_CTRL_DGRAM_SET_MTU 42 /* set cached value for * MTU. want to use this @@ -161,7 +173,22 @@ extern "C" { #define BIO_CTRL_DGRAM_SET_PEER 44 /* Destination for the data */ #define BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT 45 /* Next DTLS handshake timeout to - * adjust socket timeouts */ + * adjust socket timeouts */ + +#ifndef OPENSSL_NO_SCTP +/* SCTP stuff */ +#define BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE 50 +#define BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY 51 +#define BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY 52 +#define BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD 53 +#define BIO_CTRL_DGRAM_SCTP_GET_SNDINFO 60 +#define BIO_CTRL_DGRAM_SCTP_SET_SNDINFO 61 +#define BIO_CTRL_DGRAM_SCTP_GET_RCVINFO 62 +#define BIO_CTRL_DGRAM_SCTP_SET_RCVINFO 63 +#define BIO_CTRL_DGRAM_SCTP_GET_PRINFO 64 +#define BIO_CTRL_DGRAM_SCTP_SET_PRINFO 65 +#define BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN 70 +#endif /* modifiers */ #define BIO_FP_READ 0x02 @@ -331,6 +358,34 @@ typedef struct bio_f_buffer_ctx_struct /* Prefix and suffix callback in ASN1 BIO */ typedef int asn1_ps_func(BIO *b, unsigned char **pbuf, int *plen, void *parg); +#ifndef OPENSSL_NO_SCTP +/* SCTP parameter structs */ +struct bio_dgram_sctp_sndinfo + { + uint16_t snd_sid; + uint16_t snd_flags; + uint32_t snd_ppid; + uint32_t snd_context; + }; + +struct bio_dgram_sctp_rcvinfo + { + uint16_t rcv_sid; + uint16_t rcv_ssn; + uint16_t rcv_flags; + uint32_t rcv_ppid; + uint32_t rcv_tsn; + uint32_t rcv_cumtsn; + uint32_t rcv_context; + }; + +struct bio_dgram_sctp_prinfo + { + uint16_t pr_policy; + uint32_t pr_value; + }; +#endif + /* connect BIO stuff */ #define BIO_CONN_S_BEFORE 1 #define BIO_CONN_S_GET_IP 2 @@ -628,6 +683,9 @@ BIO_METHOD *BIO_f_linebuffer(void); BIO_METHOD *BIO_f_nbio_test(void); #ifndef OPENSSL_NO_DGRAM BIO_METHOD *BIO_s_datagram(void); +#ifndef OPENSSL_NO_SCTP +BIO_METHOD *BIO_s_datagram_sctp(void); +#endif #endif /* BIO_METHOD *BIO_f_ber(void); */ @@ -670,6 +728,15 @@ int BIO_set_tcp_ndelay(int sock,int turn_on); BIO *BIO_new_socket(int sock, int close_flag); BIO *BIO_new_dgram(int fd, int close_flag); +#ifndef OPENSSL_NO_SCTP +BIO *BIO_new_dgram_sctp(int fd, int close_flag); +int BIO_dgram_is_sctp(BIO *bio); +int BIO_dgram_sctp_notification_cb(BIO *b, + void (*handle_notifications)(BIO *bio, void *context, void *buf), + void *context); +int BIO_dgram_sctp_wait_for_dry(BIO *b); +int BIO_dgram_sctp_msg_waiting(BIO *b); +#endif BIO *BIO_new_fd(int fd, int close_flag); BIO *BIO_new_connect(char *host_port); BIO *BIO_new_accept(char *host_port); @@ -734,6 +801,7 @@ void ERR_load_BIO_strings(void); #define BIO_F_BUFFER_CTRL 114 #define BIO_F_CONN_CTRL 127 #define BIO_F_CONN_STATE 115 +#define BIO_F_DGRAM_SCTP_READ 132 #define BIO_F_FILE_CTRL 116 #define BIO_F_FILE_READ 130 #define BIO_F_LINEBUFFER_CTRL 129 diff --git a/src/lib/libcrypto/bio/bio_err.c b/src/lib/libcrypto/bio/bio_err.c index a224edd5a0..0dbfbd80d3 100644 --- a/src/lib/libcrypto/bio/bio_err.c +++ b/src/lib/libcrypto/bio/bio_err.c @@ -1,6 +1,6 @@ /* crypto/bio/bio_err.c */ /* ==================================================================== - * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -95,6 +95,7 @@ static ERR_STRING_DATA BIO_str_functs[]= {ERR_FUNC(BIO_F_BUFFER_CTRL), "BUFFER_CTRL"}, {ERR_FUNC(BIO_F_CONN_CTRL), "CONN_CTRL"}, {ERR_FUNC(BIO_F_CONN_STATE), "CONN_STATE"}, +{ERR_FUNC(BIO_F_DGRAM_SCTP_READ), "DGRAM_SCTP_READ"}, {ERR_FUNC(BIO_F_FILE_CTRL), "FILE_CTRL"}, {ERR_FUNC(BIO_F_FILE_READ), "FILE_READ"}, {ERR_FUNC(BIO_F_LINEBUFFER_CTRL), "LINEBUFFER_CTRL"}, diff --git a/src/lib/libcrypto/bio/bio_lib.c b/src/lib/libcrypto/bio/bio_lib.c index e12bc3a2ca..9c9646afa8 100644 --- a/src/lib/libcrypto/bio/bio_lib.c +++ b/src/lib/libcrypto/bio/bio_lib.c @@ -521,40 +521,40 @@ void BIO_free_all(BIO *bio) BIO *BIO_dup_chain(BIO *in) { - BIO *ret=NULL,*eoc=NULL,*bio,*new; + BIO *ret=NULL,*eoc=NULL,*bio,*new_bio; for (bio=in; bio != NULL; bio=bio->next_bio) { - if ((new=BIO_new(bio->method)) == NULL) goto err; - new->callback=bio->callback; - new->cb_arg=bio->cb_arg; - new->init=bio->init; - new->shutdown=bio->shutdown; - new->flags=bio->flags; + if ((new_bio=BIO_new(bio->method)) == NULL) goto err; + new_bio->callback=bio->callback; + new_bio->cb_arg=bio->cb_arg; + new_bio->init=bio->init; + new_bio->shutdown=bio->shutdown; + new_bio->flags=bio->flags; /* This will let SSL_s_sock() work with stdin/stdout */ - new->num=bio->num; + new_bio->num=bio->num; - if (!BIO_dup_state(bio,(char *)new)) + if (!BIO_dup_state(bio,(char *)new_bio)) { - BIO_free(new); + BIO_free(new_bio); goto err; } /* copy app data */ - if (!CRYPTO_dup_ex_data(CRYPTO_EX_INDEX_BIO, &new->ex_data, + if (!CRYPTO_dup_ex_data(CRYPTO_EX_INDEX_BIO, &new_bio->ex_data, &bio->ex_data)) goto err; if (ret == NULL) { - eoc=new; + eoc=new_bio; ret=eoc; } else { - BIO_push(eoc,new); - eoc=new; + BIO_push(eoc,new_bio); + eoc=new_bio; } } return(ret); diff --git a/src/lib/libcrypto/bio/bss_bio.c b/src/lib/libcrypto/bio/bss_bio.c index 76bd48e767..52ef0ebcb3 100644 --- a/src/lib/libcrypto/bio/bss_bio.c +++ b/src/lib/libcrypto/bio/bss_bio.c @@ -277,10 +277,10 @@ static int bio_read(BIO *bio, char *buf, int size_) */ /* WARNING: The non-copying interface is largely untested as of yet * and may contain bugs. */ -static ssize_t bio_nread0(BIO *bio, char **buf) +static ossl_ssize_t bio_nread0(BIO *bio, char **buf) { struct bio_bio_st *b, *peer_b; - ssize_t num; + ossl_ssize_t num; BIO_clear_retry_flags(bio); @@ -315,15 +315,15 @@ static ssize_t bio_nread0(BIO *bio, char **buf) return num; } -static ssize_t bio_nread(BIO *bio, char **buf, size_t num_) +static ossl_ssize_t bio_nread(BIO *bio, char **buf, size_t num_) { struct bio_bio_st *b, *peer_b; - ssize_t num, available; + ossl_ssize_t num, available; if (num_ > SSIZE_MAX) num = SSIZE_MAX; else - num = (ssize_t)num_; + num = (ossl_ssize_t)num_; available = bio_nread0(bio, buf); if (num > available) @@ -428,7 +428,7 @@ static int bio_write(BIO *bio, const char *buf, int num_) * (example usage: bio_nwrite0(), write to buffer, bio_nwrite() * or just bio_nwrite(), write to buffer) */ -static ssize_t bio_nwrite0(BIO *bio, char **buf) +static ossl_ssize_t bio_nwrite0(BIO *bio, char **buf) { struct bio_bio_st *b; size_t num; @@ -476,15 +476,15 @@ static ssize_t bio_nwrite0(BIO *bio, char **buf) return num; } -static ssize_t bio_nwrite(BIO *bio, char **buf, size_t num_) +static ossl_ssize_t bio_nwrite(BIO *bio, char **buf, size_t num_) { struct bio_bio_st *b; - ssize_t num, space; + ossl_ssize_t num, space; if (num_ > SSIZE_MAX) num = SSIZE_MAX; else - num = (ssize_t)num_; + num = (ossl_ssize_t)num_; space = bio_nwrite0(bio, buf); if (num > space) diff --git a/src/lib/libcrypto/bio/bss_dgram.c b/src/lib/libcrypto/bio/bss_dgram.c index 71ebe987b6..1b1e4bec81 100644 --- a/src/lib/libcrypto/bio/bss_dgram.c +++ b/src/lib/libcrypto/bio/bss_dgram.c @@ -70,6 +70,13 @@ #include #endif +#ifndef OPENSSL_NO_SCTP +#include +#include +#define OPENSSL_SCTP_DATA_CHUNK_TYPE 0x00 +#define OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE 0xc0 +#endif + #ifdef OPENSSL_SYS_LINUX #define IP_MTU 14 /* linux is lame */ #endif @@ -88,6 +95,18 @@ static int dgram_new(BIO *h); static int dgram_free(BIO *data); static int dgram_clear(BIO *bio); +#ifndef OPENSSL_NO_SCTP +static int dgram_sctp_write(BIO *h, const char *buf, int num); +static int dgram_sctp_read(BIO *h, char *buf, int size); +static int dgram_sctp_puts(BIO *h, const char *str); +static long dgram_sctp_ctrl(BIO *h, int cmd, long arg1, void *arg2); +static int dgram_sctp_new(BIO *h); +static int dgram_sctp_free(BIO *data); +#ifdef SCTP_AUTHENTICATION_EVENT +static void dgram_sctp_handle_auth_free_key_event(BIO *b, union sctp_notification *snp); +#endif +#endif + static int BIO_dgram_should_retry(int s); static void get_current_time(struct timeval *t); @@ -106,6 +125,22 @@ static BIO_METHOD methods_dgramp= NULL, }; +#ifndef OPENSSL_NO_SCTP +static BIO_METHOD methods_dgramp_sctp= + { + BIO_TYPE_DGRAM_SCTP, + "datagram sctp socket", + dgram_sctp_write, + dgram_sctp_read, + dgram_sctp_puts, + NULL, /* dgram_gets, */ + dgram_sctp_ctrl, + dgram_sctp_new, + dgram_sctp_free, + NULL, + }; +#endif + typedef struct bio_dgram_data_st { union { @@ -122,6 +157,40 @@ typedef struct bio_dgram_data_st struct timeval socket_timeout; } bio_dgram_data; +#ifndef OPENSSL_NO_SCTP +typedef struct bio_dgram_sctp_save_message_st + { + BIO *bio; + char *data; + int length; + } bio_dgram_sctp_save_message; + +typedef struct bio_dgram_sctp_data_st + { + union { + struct sockaddr sa; + struct sockaddr_in sa_in; +#if OPENSSL_USE_IPV6 + struct sockaddr_in6 sa_in6; +#endif + } peer; + unsigned int connected; + unsigned int _errno; + unsigned int mtu; + struct bio_dgram_sctp_sndinfo sndinfo; + struct bio_dgram_sctp_rcvinfo rcvinfo; + struct bio_dgram_sctp_prinfo prinfo; + void (*handle_notifications)(BIO *bio, void *context, void *buf); + void* notification_context; + int in_handshake; + int ccs_rcvd; + int ccs_sent; + int save_shutdown; + int peer_auth_tested; + bio_dgram_sctp_save_message saved_message; + } bio_dgram_sctp_data; +#endif + BIO_METHOD *BIO_s_datagram(void) { return(&methods_dgramp); @@ -547,6 +616,27 @@ static long dgram_ctrl(BIO *b, int cmd, long num, void *ptr) ret = 0; #endif break; + case BIO_CTRL_DGRAM_GET_FALLBACK_MTU: + switch (data->peer.sa.sa_family) + { + case AF_INET: + ret = 576 - 20 - 8; + break; +#if OPENSSL_USE_IPV6 + case AF_INET6: +#ifdef IN6_IS_ADDR_V4MAPPED + if (IN6_IS_ADDR_V4MAPPED(&data->peer.sa_in6.sin6_addr)) + ret = 576 - 20 - 8; + else +#endif + ret = 1280 - 40 - 8; + break; +#endif + default: + ret = 576 - 20 - 8; + break; + } + break; case BIO_CTRL_DGRAM_GET_MTU: return data->mtu; break; @@ -738,6 +828,912 @@ static int dgram_puts(BIO *bp, const char *str) return(ret); } +#ifndef OPENSSL_NO_SCTP +BIO_METHOD *BIO_s_datagram_sctp(void) + { + return(&methods_dgramp_sctp); + } + +BIO *BIO_new_dgram_sctp(int fd, int close_flag) + { + BIO *bio; + int ret, optval = 20000; + int auth_data = 0, auth_forward = 0; + unsigned char *p; + struct sctp_authchunk auth; + struct sctp_authchunks *authchunks; + socklen_t sockopt_len; +#ifdef SCTP_AUTHENTICATION_EVENT +#ifdef SCTP_EVENT + struct sctp_event event; +#else + struct sctp_event_subscribe event; +#endif +#endif + + bio=BIO_new(BIO_s_datagram_sctp()); + if (bio == NULL) return(NULL); + BIO_set_fd(bio,fd,close_flag); + + /* Activate SCTP-AUTH for DATA and FORWARD-TSN chunks */ + auth.sauth_chunk = OPENSSL_SCTP_DATA_CHUNK_TYPE; + ret = setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth, sizeof(struct sctp_authchunk)); + OPENSSL_assert(ret >= 0); + auth.sauth_chunk = OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE; + ret = setsockopt(fd, IPPROTO_SCTP, SCTP_AUTH_CHUNK, &auth, sizeof(struct sctp_authchunk)); + OPENSSL_assert(ret >= 0); + + /* Test if activation was successful. When using accept(), + * SCTP-AUTH has to be activated for the listening socket + * already, otherwise the connected socket won't use it. */ + sockopt_len = (socklen_t)(sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t)); + authchunks = OPENSSL_malloc(sockopt_len); + memset(authchunks, 0, sizeof(sockopt_len)); + ret = getsockopt(fd, IPPROTO_SCTP, SCTP_LOCAL_AUTH_CHUNKS, authchunks, &sockopt_len); + OPENSSL_assert(ret >= 0); + + for (p = (unsigned char*) authchunks + sizeof(sctp_assoc_t); + p < (unsigned char*) authchunks + sockopt_len; + p += sizeof(uint8_t)) + { + if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE) auth_data = 1; + if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE) auth_forward = 1; + } + + OPENSSL_free(authchunks); + + OPENSSL_assert(auth_data); + OPENSSL_assert(auth_forward); + +#ifdef SCTP_AUTHENTICATION_EVENT +#ifdef SCTP_EVENT + memset(&event, 0, sizeof(struct sctp_event)); + event.se_assoc_id = 0; + event.se_type = SCTP_AUTHENTICATION_EVENT; + event.se_on = 1; + ret = setsockopt(fd, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event)); + OPENSSL_assert(ret >= 0); +#else + sockopt_len = (socklen_t) sizeof(struct sctp_event_subscribe); + ret = getsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event, &sockopt_len); + OPENSSL_assert(ret >= 0); + + event.sctp_authentication_event = 1; + + ret = setsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe)); + OPENSSL_assert(ret >= 0); +#endif +#endif + + /* Disable partial delivery by setting the min size + * larger than the max record size of 2^14 + 2048 + 13 + */ + ret = setsockopt(fd, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT, &optval, sizeof(optval)); + OPENSSL_assert(ret >= 0); + + return(bio); + } + +int BIO_dgram_is_sctp(BIO *bio) + { + return (BIO_method_type(bio) == BIO_TYPE_DGRAM_SCTP); + } + +static int dgram_sctp_new(BIO *bi) + { + bio_dgram_sctp_data *data = NULL; + + bi->init=0; + bi->num=0; + data = OPENSSL_malloc(sizeof(bio_dgram_sctp_data)); + if (data == NULL) + return 0; + memset(data, 0x00, sizeof(bio_dgram_sctp_data)); +#ifdef SCTP_PR_SCTP_NONE + data->prinfo.pr_policy = SCTP_PR_SCTP_NONE; +#endif + bi->ptr = data; + + bi->flags=0; + return(1); + } + +static int dgram_sctp_free(BIO *a) + { + bio_dgram_sctp_data *data; + + if (a == NULL) return(0); + if ( ! dgram_clear(a)) + return 0; + + data = (bio_dgram_sctp_data *)a->ptr; + if(data != NULL) OPENSSL_free(data); + + return(1); + } + +#ifdef SCTP_AUTHENTICATION_EVENT +void dgram_sctp_handle_auth_free_key_event(BIO *b, union sctp_notification *snp) + { + unsigned int sockopt_len = 0; + int ret; + struct sctp_authkey_event* authkeyevent = &snp->sn_auth_event; + + if (authkeyevent->auth_indication == SCTP_AUTH_FREE_KEY) + { + struct sctp_authkeyid authkeyid; + + /* delete key */ + authkeyid.scact_keynumber = authkeyevent->auth_keynumber; + sockopt_len = sizeof(struct sctp_authkeyid); + ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY, + &authkeyid, sockopt_len); + } + } +#endif + +static int dgram_sctp_read(BIO *b, char *out, int outl) + { + int ret = 0, n = 0, i, optval; + socklen_t optlen; + bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr; + union sctp_notification *snp; + struct msghdr msg; + struct iovec iov; + struct cmsghdr *cmsg; + char cmsgbuf[512]; + + if (out != NULL) + { + clear_socket_error(); + + do + { + memset(&data->rcvinfo, 0x00, sizeof(struct bio_dgram_sctp_rcvinfo)); + iov.iov_base = out; + iov.iov_len = outl; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = cmsgbuf; + msg.msg_controllen = 512; + msg.msg_flags = 0; + n = recvmsg(b->num, &msg, 0); + + if (msg.msg_controllen > 0) + { + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) + { + if (cmsg->cmsg_level != IPPROTO_SCTP) + continue; +#ifdef SCTP_RCVINFO + if (cmsg->cmsg_type == SCTP_RCVINFO) + { + struct sctp_rcvinfo *rcvinfo; + + rcvinfo = (struct sctp_rcvinfo *)CMSG_DATA(cmsg); + data->rcvinfo.rcv_sid = rcvinfo->rcv_sid; + data->rcvinfo.rcv_ssn = rcvinfo->rcv_ssn; + data->rcvinfo.rcv_flags = rcvinfo->rcv_flags; + data->rcvinfo.rcv_ppid = rcvinfo->rcv_ppid; + data->rcvinfo.rcv_tsn = rcvinfo->rcv_tsn; + data->rcvinfo.rcv_cumtsn = rcvinfo->rcv_cumtsn; + data->rcvinfo.rcv_context = rcvinfo->rcv_context; + } +#endif +#ifdef SCTP_SNDRCV + if (cmsg->cmsg_type == SCTP_SNDRCV) + { + struct sctp_sndrcvinfo *sndrcvinfo; + + sndrcvinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + data->rcvinfo.rcv_sid = sndrcvinfo->sinfo_stream; + data->rcvinfo.rcv_ssn = sndrcvinfo->sinfo_ssn; + data->rcvinfo.rcv_flags = sndrcvinfo->sinfo_flags; + data->rcvinfo.rcv_ppid = sndrcvinfo->sinfo_ppid; + data->rcvinfo.rcv_tsn = sndrcvinfo->sinfo_tsn; + data->rcvinfo.rcv_cumtsn = sndrcvinfo->sinfo_cumtsn; + data->rcvinfo.rcv_context = sndrcvinfo->sinfo_context; + } +#endif + } + } + + if (n <= 0) + { + if (n < 0) + ret = n; + break; + } + + if (msg.msg_flags & MSG_NOTIFICATION) + { + snp = (union sctp_notification*) out; + if (snp->sn_header.sn_type == SCTP_SENDER_DRY_EVENT) + { +#ifdef SCTP_EVENT + struct sctp_event event; +#else + struct sctp_event_subscribe event; + socklen_t eventsize; +#endif + /* If a message has been delayed until the socket + * is dry, it can be sent now. + */ + if (data->saved_message.length > 0) + { + dgram_sctp_write(data->saved_message.bio, data->saved_message.data, + data->saved_message.length); + OPENSSL_free(data->saved_message.data); + data->saved_message.length = 0; + } + + /* disable sender dry event */ +#ifdef SCTP_EVENT + memset(&event, 0, sizeof(struct sctp_event)); + event.se_assoc_id = 0; + event.se_type = SCTP_SENDER_DRY_EVENT; + event.se_on = 0; + i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event)); + OPENSSL_assert(i >= 0); +#else + eventsize = sizeof(struct sctp_event_subscribe); + i = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize); + OPENSSL_assert(i >= 0); + + event.sctp_sender_dry_event = 0; + + i = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe)); + OPENSSL_assert(i >= 0); +#endif + } + +#ifdef SCTP_AUTHENTICATION_EVENT + if (snp->sn_header.sn_type == SCTP_AUTHENTICATION_EVENT) + dgram_sctp_handle_auth_free_key_event(b, snp); +#endif + + if (data->handle_notifications != NULL) + data->handle_notifications(b, data->notification_context, (void*) out); + + memset(out, 0, outl); + } + else + ret += n; + } + while ((msg.msg_flags & MSG_NOTIFICATION) && (msg.msg_flags & MSG_EOR) && (ret < outl)); + + if (ret > 0 && !(msg.msg_flags & MSG_EOR)) + { + /* Partial message read, this should never happen! */ + + /* The buffer was too small, this means the peer sent + * a message that was larger than allowed. */ + if (ret == outl) + return -1; + + /* Test if socket buffer can handle max record + * size (2^14 + 2048 + 13) + */ + optlen = (socklen_t) sizeof(int); + ret = getsockopt(b->num, SOL_SOCKET, SO_RCVBUF, &optval, &optlen); + OPENSSL_assert(ret >= 0); + OPENSSL_assert(optval >= 18445); + + /* Test if SCTP doesn't partially deliver below + * max record size (2^14 + 2048 + 13) + */ + optlen = (socklen_t) sizeof(int); + ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_PARTIAL_DELIVERY_POINT, + &optval, &optlen); + OPENSSL_assert(ret >= 0); + OPENSSL_assert(optval >= 18445); + + /* Partially delivered notification??? Probably a bug.... */ + OPENSSL_assert(!(msg.msg_flags & MSG_NOTIFICATION)); + + /* Everything seems ok till now, so it's most likely + * a message dropped by PR-SCTP. + */ + memset(out, 0, outl); + BIO_set_retry_read(b); + return -1; + } + + BIO_clear_retry_flags(b); + if (ret < 0) + { + if (BIO_dgram_should_retry(ret)) + { + BIO_set_retry_read(b); + data->_errno = get_last_socket_error(); + } + } + + /* Test if peer uses SCTP-AUTH before continuing */ + if (!data->peer_auth_tested) + { + int ii, auth_data = 0, auth_forward = 0; + unsigned char *p; + struct sctp_authchunks *authchunks; + + optlen = (socklen_t)(sizeof(sctp_assoc_t) + 256 * sizeof(uint8_t)); + authchunks = OPENSSL_malloc(optlen); + memset(authchunks, 0, sizeof(optlen)); + ii = getsockopt(b->num, IPPROTO_SCTP, SCTP_PEER_AUTH_CHUNKS, authchunks, &optlen); + OPENSSL_assert(ii >= 0); + + for (p = (unsigned char*) authchunks + sizeof(sctp_assoc_t); + p < (unsigned char*) authchunks + optlen; + p += sizeof(uint8_t)) + { + if (*p == OPENSSL_SCTP_DATA_CHUNK_TYPE) auth_data = 1; + if (*p == OPENSSL_SCTP_FORWARD_CUM_TSN_CHUNK_TYPE) auth_forward = 1; + } + + OPENSSL_free(authchunks); + + if (!auth_data || !auth_forward) + { + BIOerr(BIO_F_DGRAM_SCTP_READ,BIO_R_CONNECT_ERROR); + return -1; + } + + data->peer_auth_tested = 1; + } + } + return(ret); + } + +static int dgram_sctp_write(BIO *b, const char *in, int inl) + { + int ret; + bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr; + struct bio_dgram_sctp_sndinfo *sinfo = &(data->sndinfo); + struct bio_dgram_sctp_prinfo *pinfo = &(data->prinfo); + struct bio_dgram_sctp_sndinfo handshake_sinfo; + struct iovec iov[1]; + struct msghdr msg; + struct cmsghdr *cmsg; +#if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO) + char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo)) + CMSG_SPACE(sizeof(struct sctp_prinfo))]; + struct sctp_sndinfo *sndinfo; + struct sctp_prinfo *prinfo; +#else + char cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; + struct sctp_sndrcvinfo *sndrcvinfo; +#endif + + clear_socket_error(); + + /* If we're send anything else than application data, + * disable all user parameters and flags. + */ + if (in[0] != 23) { + memset(&handshake_sinfo, 0x00, sizeof(struct bio_dgram_sctp_sndinfo)); +#ifdef SCTP_SACK_IMMEDIATELY + handshake_sinfo.snd_flags = SCTP_SACK_IMMEDIATELY; +#endif + sinfo = &handshake_sinfo; + } + + /* If we have to send a shutdown alert message and the + * socket is not dry yet, we have to save it and send it + * as soon as the socket gets dry. + */ + if (data->save_shutdown && !BIO_dgram_sctp_wait_for_dry(b)) + { + data->saved_message.bio = b; + data->saved_message.length = inl; + data->saved_message.data = OPENSSL_malloc(inl); + memcpy(data->saved_message.data, in, inl); + return inl; + } + + iov[0].iov_base = (char *)in; + iov[0].iov_len = inl; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = iov; + msg.msg_iovlen = 1; + msg.msg_control = (caddr_t)cmsgbuf; + msg.msg_controllen = 0; + msg.msg_flags = 0; +#if defined(SCTP_SNDINFO) && defined(SCTP_PRINFO) + cmsg = (struct cmsghdr *)cmsgbuf; + cmsg->cmsg_level = IPPROTO_SCTP; + cmsg->cmsg_type = SCTP_SNDINFO; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndinfo)); + sndinfo = (struct sctp_sndinfo *)CMSG_DATA(cmsg); + memset(sndinfo, 0, sizeof(struct sctp_sndinfo)); + sndinfo->snd_sid = sinfo->snd_sid; + sndinfo->snd_flags = sinfo->snd_flags; + sndinfo->snd_ppid = sinfo->snd_ppid; + sndinfo->snd_context = sinfo->snd_context; + msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndinfo)); + + cmsg = (struct cmsghdr *)&cmsgbuf[CMSG_SPACE(sizeof(struct sctp_sndinfo))]; + cmsg->cmsg_level = IPPROTO_SCTP; + cmsg->cmsg_type = SCTP_PRINFO; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_prinfo)); + prinfo = (struct sctp_prinfo *)CMSG_DATA(cmsg); + memset(prinfo, 0, sizeof(struct sctp_prinfo)); + prinfo->pr_policy = pinfo->pr_policy; + prinfo->pr_value = pinfo->pr_value; + msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_prinfo)); +#else + cmsg = (struct cmsghdr *)cmsgbuf; + cmsg->cmsg_level = IPPROTO_SCTP; + cmsg->cmsg_type = SCTP_SNDRCV; + cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); + sndrcvinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + memset(sndrcvinfo, 0, sizeof(struct sctp_sndrcvinfo)); + sndrcvinfo->sinfo_stream = sinfo->snd_sid; + sndrcvinfo->sinfo_flags = sinfo->snd_flags; +#ifdef __FreeBSD__ + sndrcvinfo->sinfo_flags |= pinfo->pr_policy; +#endif + sndrcvinfo->sinfo_ppid = sinfo->snd_ppid; + sndrcvinfo->sinfo_context = sinfo->snd_context; + sndrcvinfo->sinfo_timetolive = pinfo->pr_value; + msg.msg_controllen += CMSG_SPACE(sizeof(struct sctp_sndrcvinfo)); +#endif + + ret = sendmsg(b->num, &msg, 0); + + BIO_clear_retry_flags(b); + if (ret <= 0) + { + if (BIO_dgram_should_retry(ret)) + { + BIO_set_retry_write(b); + data->_errno = get_last_socket_error(); + } + } + return(ret); + } + +static long dgram_sctp_ctrl(BIO *b, int cmd, long num, void *ptr) + { + long ret=1; + bio_dgram_sctp_data *data = NULL; + unsigned int sockopt_len = 0; + struct sctp_authkeyid authkeyid; + struct sctp_authkey *authkey; + + data = (bio_dgram_sctp_data *)b->ptr; + + switch (cmd) + { + case BIO_CTRL_DGRAM_QUERY_MTU: + /* Set to maximum (2^14) + * and ignore user input to enable transport + * protocol fragmentation. + * Returns always 2^14. + */ + data->mtu = 16384; + ret = data->mtu; + break; + case BIO_CTRL_DGRAM_SET_MTU: + /* Set to maximum (2^14) + * and ignore input to enable transport + * protocol fragmentation. + * Returns always 2^14. + */ + data->mtu = 16384; + ret = data->mtu; + break; + case BIO_CTRL_DGRAM_SET_CONNECTED: + case BIO_CTRL_DGRAM_CONNECT: + /* Returns always -1. */ + ret = -1; + break; + case BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT: + /* SCTP doesn't need the DTLS timer + * Returns always 1. + */ + break; + case BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE: + if (num > 0) + data->in_handshake = 1; + else + data->in_handshake = 0; + + ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_NODELAY, &data->in_handshake, sizeof(int)); + break; + case BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY: + /* New shared key for SCTP AUTH. + * Returns 0 on success, -1 otherwise. + */ + + /* Get active key */ + sockopt_len = sizeof(struct sctp_authkeyid); + ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len); + if (ret < 0) break; + + /* Add new key */ + sockopt_len = sizeof(struct sctp_authkey) + 64 * sizeof(uint8_t); + authkey = OPENSSL_malloc(sockopt_len); + memset(authkey, 0x00, sockopt_len); + authkey->sca_keynumber = authkeyid.scact_keynumber + 1; +#ifndef __FreeBSD__ + /* This field is missing in FreeBSD 8.2 and earlier, + * and FreeBSD 8.3 and higher work without it. + */ + authkey->sca_keylength = 64; +#endif + memcpy(&authkey->sca_key[0], ptr, 64 * sizeof(uint8_t)); + + ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_KEY, authkey, sockopt_len); + if (ret < 0) break; + + /* Reset active key */ + ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, + &authkeyid, sizeof(struct sctp_authkeyid)); + if (ret < 0) break; + + break; + case BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY: + /* Returns 0 on success, -1 otherwise. */ + + /* Get active key */ + sockopt_len = sizeof(struct sctp_authkeyid); + ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len); + if (ret < 0) break; + + /* Set active key */ + authkeyid.scact_keynumber = authkeyid.scact_keynumber + 1; + ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, + &authkeyid, sizeof(struct sctp_authkeyid)); + if (ret < 0) break; + + /* CCS has been sent, so remember that and fall through + * to check if we need to deactivate an old key + */ + data->ccs_sent = 1; + + case BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD: + /* Returns 0 on success, -1 otherwise. */ + + /* Has this command really been called or is this just a fall-through? */ + if (cmd == BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD) + data->ccs_rcvd = 1; + + /* CSS has been both, received and sent, so deactivate an old key */ + if (data->ccs_rcvd == 1 && data->ccs_sent == 1) + { + /* Get active key */ + sockopt_len = sizeof(struct sctp_authkeyid); + ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_ACTIVE_KEY, &authkeyid, &sockopt_len); + if (ret < 0) break; + + /* Deactivate key or delete second last key if + * SCTP_AUTHENTICATION_EVENT is not available. + */ + authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1; +#ifdef SCTP_AUTH_DEACTIVATE_KEY + sockopt_len = sizeof(struct sctp_authkeyid); + ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DEACTIVATE_KEY, + &authkeyid, sockopt_len); + if (ret < 0) break; +#endif +#ifndef SCTP_AUTHENTICATION_EVENT + if (authkeyid.scact_keynumber > 0) + { + authkeyid.scact_keynumber = authkeyid.scact_keynumber - 1; + ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_AUTH_DELETE_KEY, + &authkeyid, sizeof(struct sctp_authkeyid)); + if (ret < 0) break; + } +#endif + + data->ccs_rcvd = 0; + data->ccs_sent = 0; + } + break; + case BIO_CTRL_DGRAM_SCTP_GET_SNDINFO: + /* Returns the size of the copied struct. */ + if (num > (long) sizeof(struct bio_dgram_sctp_sndinfo)) + num = sizeof(struct bio_dgram_sctp_sndinfo); + + memcpy(ptr, &(data->sndinfo), num); + ret = num; + break; + case BIO_CTRL_DGRAM_SCTP_SET_SNDINFO: + /* Returns the size of the copied struct. */ + if (num > (long) sizeof(struct bio_dgram_sctp_sndinfo)) + num = sizeof(struct bio_dgram_sctp_sndinfo); + + memcpy(&(data->sndinfo), ptr, num); + break; + case BIO_CTRL_DGRAM_SCTP_GET_RCVINFO: + /* Returns the size of the copied struct. */ + if (num > (long) sizeof(struct bio_dgram_sctp_rcvinfo)) + num = sizeof(struct bio_dgram_sctp_rcvinfo); + + memcpy(ptr, &data->rcvinfo, num); + + ret = num; + break; + case BIO_CTRL_DGRAM_SCTP_SET_RCVINFO: + /* Returns the size of the copied struct. */ + if (num > (long) sizeof(struct bio_dgram_sctp_rcvinfo)) + num = sizeof(struct bio_dgram_sctp_rcvinfo); + + memcpy(&(data->rcvinfo), ptr, num); + break; + case BIO_CTRL_DGRAM_SCTP_GET_PRINFO: + /* Returns the size of the copied struct. */ + if (num > (long) sizeof(struct bio_dgram_sctp_prinfo)) + num = sizeof(struct bio_dgram_sctp_prinfo); + + memcpy(ptr, &(data->prinfo), num); + ret = num; + break; + case BIO_CTRL_DGRAM_SCTP_SET_PRINFO: + /* Returns the size of the copied struct. */ + if (num > (long) sizeof(struct bio_dgram_sctp_prinfo)) + num = sizeof(struct bio_dgram_sctp_prinfo); + + memcpy(&(data->prinfo), ptr, num); + break; + case BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN: + /* Returns always 1. */ + if (num > 0) + data->save_shutdown = 1; + else + data->save_shutdown = 0; + break; + + default: + /* Pass to default ctrl function to + * process SCTP unspecific commands + */ + ret=dgram_ctrl(b, cmd, num, ptr); + break; + } + return(ret); + } + +int BIO_dgram_sctp_notification_cb(BIO *b, + void (*handle_notifications)(BIO *bio, void *context, void *buf), + void *context) + { + bio_dgram_sctp_data *data = (bio_dgram_sctp_data *) b->ptr; + + if (handle_notifications != NULL) + { + data->handle_notifications = handle_notifications; + data->notification_context = context; + } + else + return -1; + + return 0; + } + +int BIO_dgram_sctp_wait_for_dry(BIO *b) +{ + int is_dry = 0; + int n, sockflags, ret; + union sctp_notification snp; + struct msghdr msg; + struct iovec iov; +#ifdef SCTP_EVENT + struct sctp_event event; +#else + struct sctp_event_subscribe event; + socklen_t eventsize; +#endif + bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr; + + /* set sender dry event */ +#ifdef SCTP_EVENT + memset(&event, 0, sizeof(struct sctp_event)); + event.se_assoc_id = 0; + event.se_type = SCTP_SENDER_DRY_EVENT; + event.se_on = 1; + ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event)); +#else + eventsize = sizeof(struct sctp_event_subscribe); + ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize); + if (ret < 0) + return -1; + + event.sctp_sender_dry_event = 1; + + ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe)); +#endif + if (ret < 0) + return -1; + + /* peek for notification */ + memset(&snp, 0x00, sizeof(union sctp_notification)); + iov.iov_base = (char *)&snp; + iov.iov_len = sizeof(union sctp_notification); + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + n = recvmsg(b->num, &msg, MSG_PEEK); + if (n <= 0) + { + if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK)) + return -1; + else + return 0; + } + + /* if we find a notification, process it and try again if necessary */ + while (msg.msg_flags & MSG_NOTIFICATION) + { + memset(&snp, 0x00, sizeof(union sctp_notification)); + iov.iov_base = (char *)&snp; + iov.iov_len = sizeof(union sctp_notification); + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + n = recvmsg(b->num, &msg, 0); + if (n <= 0) + { + if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK)) + return -1; + else + return is_dry; + } + + if (snp.sn_header.sn_type == SCTP_SENDER_DRY_EVENT) + { + is_dry = 1; + + /* disable sender dry event */ +#ifdef SCTP_EVENT + memset(&event, 0, sizeof(struct sctp_event)); + event.se_assoc_id = 0; + event.se_type = SCTP_SENDER_DRY_EVENT; + event.se_on = 0; + ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENT, &event, sizeof(struct sctp_event)); +#else + eventsize = (socklen_t) sizeof(struct sctp_event_subscribe); + ret = getsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, &eventsize); + if (ret < 0) + return -1; + + event.sctp_sender_dry_event = 0; + + ret = setsockopt(b->num, IPPROTO_SCTP, SCTP_EVENTS, &event, sizeof(struct sctp_event_subscribe)); +#endif + if (ret < 0) + return -1; + } + +#ifdef SCTP_AUTHENTICATION_EVENT + if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT) + dgram_sctp_handle_auth_free_key_event(b, &snp); +#endif + + if (data->handle_notifications != NULL) + data->handle_notifications(b, data->notification_context, (void*) &snp); + + /* found notification, peek again */ + memset(&snp, 0x00, sizeof(union sctp_notification)); + iov.iov_base = (char *)&snp; + iov.iov_len = sizeof(union sctp_notification); + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + /* if we have seen the dry already, don't wait */ + if (is_dry) + { + sockflags = fcntl(b->num, F_GETFL, 0); + fcntl(b->num, F_SETFL, O_NONBLOCK); + } + + n = recvmsg(b->num, &msg, MSG_PEEK); + + if (is_dry) + { + fcntl(b->num, F_SETFL, sockflags); + } + + if (n <= 0) + { + if ((n < 0) && (get_last_socket_error() != EAGAIN) && (get_last_socket_error() != EWOULDBLOCK)) + return -1; + else + return is_dry; + } + } + + /* read anything else */ + return is_dry; +} + +int BIO_dgram_sctp_msg_waiting(BIO *b) + { + int n, sockflags; + union sctp_notification snp; + struct msghdr msg; + struct iovec iov; + bio_dgram_sctp_data *data = (bio_dgram_sctp_data *)b->ptr; + + /* Check if there are any messages waiting to be read */ + do + { + memset(&snp, 0x00, sizeof(union sctp_notification)); + iov.iov_base = (char *)&snp; + iov.iov_len = sizeof(union sctp_notification); + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + sockflags = fcntl(b->num, F_GETFL, 0); + fcntl(b->num, F_SETFL, O_NONBLOCK); + n = recvmsg(b->num, &msg, MSG_PEEK); + fcntl(b->num, F_SETFL, sockflags); + + /* if notification, process and try again */ + if (n > 0 && (msg.msg_flags & MSG_NOTIFICATION)) + { +#ifdef SCTP_AUTHENTICATION_EVENT + if (snp.sn_header.sn_type == SCTP_AUTHENTICATION_EVENT) + dgram_sctp_handle_auth_free_key_event(b, &snp); +#endif + + memset(&snp, 0x00, sizeof(union sctp_notification)); + iov.iov_base = (char *)&snp; + iov.iov_len = sizeof(union sctp_notification); + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + n = recvmsg(b->num, &msg, 0); + + if (data->handle_notifications != NULL) + data->handle_notifications(b, data->notification_context, (void*) &snp); + } + + } while (n > 0 && (msg.msg_flags & MSG_NOTIFICATION)); + + /* Return 1 if there is a message to be read, return 0 otherwise. */ + if (n > 0) + return 1; + else + return 0; + } + +static int dgram_sctp_puts(BIO *bp, const char *str) + { + int n,ret; + + n=strlen(str); + ret=dgram_sctp_write(bp,str,n); + return(ret); + } +#endif + static int BIO_dgram_should_retry(int i) { int err; diff --git a/src/lib/libcrypto/bn/asm/armv4-gf2m.pl b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl new file mode 100644 index 0000000000..c52e0b75b5 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/armv4-gf2m.pl @@ -0,0 +1,278 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# May 2011 +# +# The module implements bn_GF2m_mul_2x2 polynomial multiplication +# used in bn_gf2m.c. It's kind of low-hanging mechanical port from +# C for the time being... Except that it has two code paths: pure +# integer code suitable for any ARMv4 and later CPU and NEON code +# suitable for ARMv7. Pure integer 1x1 multiplication subroutine runs +# in ~45 cycles on dual-issue core such as Cortex A8, which is ~50% +# faster than compiler-generated code. For ECDH and ECDSA verify (but +# not for ECDSA sign) it means 25%-45% improvement depending on key +# length, more for longer keys. Even though NEON 1x1 multiplication +# runs in even less cycles, ~30, improvement is measurable only on +# longer keys. One has to optimize code elsewhere to get NEON glow... + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } +sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } + +$code=<<___; +#include "arm_arch.h" + +.text +.code 32 + +#if __ARM_ARCH__>=7 +.fpu neon + +.type mul_1x1_neon,%function +.align 5 +mul_1x1_neon: + vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a + vmull.p8 `&Q("d0")`,d16,d17 @ a·bb + vshl.u64 `&Dlo("q2")`,d16,#16 + vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb + vshl.u64 `&Dlo("q3")`,d16,#24 + vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb + vshr.u64 `&Dlo("q1")`,#8 + vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb + vshl.u64 `&Dhi("q1")`,#24 + veor d0,`&Dlo("q1")` + vshr.u64 `&Dlo("q2")`,#16 + veor d0,`&Dhi("q1")` + vshl.u64 `&Dhi("q2")`,#16 + veor d0,`&Dlo("q2")` + vshr.u64 `&Dlo("q3")`,#24 + veor d0,`&Dhi("q2")` + vshl.u64 `&Dhi("q3")`,#8 + veor d0,`&Dlo("q3")` + veor d0,`&Dhi("q3")` + bx lr +.size mul_1x1_neon,.-mul_1x1_neon +#endif +___ +################ +# private interface to mul_1x1_ialu +# +$a="r1"; +$b="r0"; + +($a0,$a1,$a2,$a12,$a4,$a14)= +($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12); + +$mask="r12"; + +$code.=<<___; +.type mul_1x1_ialu,%function +.align 5 +mul_1x1_ialu: + mov $a0,#0 + bic $a1,$a,#3<<30 @ a1=a&0x3fffffff + str $a0,[sp,#0] @ tab[0]=0 + add $a2,$a1,$a1 @ a2=a1<<1 + str $a1,[sp,#4] @ tab[1]=a1 + eor $a12,$a1,$a2 @ a1^a2 + str $a2,[sp,#8] @ tab[2]=a2 + mov $a4,$a1,lsl#2 @ a4=a1<<2 + str $a12,[sp,#12] @ tab[3]=a1^a2 + eor $a14,$a1,$a4 @ a1^a4 + str $a4,[sp,#16] @ tab[4]=a4 + eor $a0,$a2,$a4 @ a2^a4 + str $a14,[sp,#20] @ tab[5]=a1^a4 + eor $a12,$a12,$a4 @ a1^a2^a4 + str $a0,[sp,#24] @ tab[6]=a2^a4 + and $i0,$mask,$b,lsl#2 + str $a12,[sp,#28] @ tab[7]=a1^a2^a4 + + and $i1,$mask,$b,lsr#1 + ldr $lo,[sp,$i0] @ tab[b & 0x7] + and $i0,$mask,$b,lsr#4 + ldr $t1,[sp,$i1] @ tab[b >> 3 & 0x7] + and $i1,$mask,$b,lsr#7 + ldr $t0,[sp,$i0] @ tab[b >> 6 & 0x7] + eor $lo,$lo,$t1,lsl#3 @ stall + mov $hi,$t1,lsr#29 + ldr $t1,[sp,$i1] @ tab[b >> 9 & 0x7] + + and $i0,$mask,$b,lsr#10 + eor $lo,$lo,$t0,lsl#6 + eor $hi,$hi,$t0,lsr#26 + ldr $t0,[sp,$i0] @ tab[b >> 12 & 0x7] + + and $i1,$mask,$b,lsr#13 + eor $lo,$lo,$t1,lsl#9 + eor $hi,$hi,$t1,lsr#23 + ldr $t1,[sp,$i1] @ tab[b >> 15 & 0x7] + + and $i0,$mask,$b,lsr#16 + eor $lo,$lo,$t0,lsl#12 + eor $hi,$hi,$t0,lsr#20 + ldr $t0,[sp,$i0] @ tab[b >> 18 & 0x7] + + and $i1,$mask,$b,lsr#19 + eor $lo,$lo,$t1,lsl#15 + eor $hi,$hi,$t1,lsr#17 + ldr $t1,[sp,$i1] @ tab[b >> 21 & 0x7] + + and $i0,$mask,$b,lsr#22 + eor $lo,$lo,$t0,lsl#18 + eor $hi,$hi,$t0,lsr#14 + ldr $t0,[sp,$i0] @ tab[b >> 24 & 0x7] + + and $i1,$mask,$b,lsr#25 + eor $lo,$lo,$t1,lsl#21 + eor $hi,$hi,$t1,lsr#11 + ldr $t1,[sp,$i1] @ tab[b >> 27 & 0x7] + + tst $a,#1<<30 + and $i0,$mask,$b,lsr#28 + eor $lo,$lo,$t0,lsl#24 + eor $hi,$hi,$t0,lsr#8 + ldr $t0,[sp,$i0] @ tab[b >> 30 ] + + eorne $lo,$lo,$b,lsl#30 + eorne $hi,$hi,$b,lsr#2 + tst $a,#1<<31 + eor $lo,$lo,$t1,lsl#27 + eor $hi,$hi,$t1,lsr#5 + eorne $lo,$lo,$b,lsl#31 + eorne $hi,$hi,$b,lsr#1 + eor $lo,$lo,$t0,lsl#30 + eor $hi,$hi,$t0,lsr#2 + + mov pc,lr +.size mul_1x1_ialu,.-mul_1x1_ialu +___ +################ +# void bn_GF2m_mul_2x2(BN_ULONG *r, +# BN_ULONG a1,BN_ULONG a0, +# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0 + +($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23)); + +$code.=<<___; +.global bn_GF2m_mul_2x2 +.type bn_GF2m_mul_2x2,%function +.align 5 +bn_GF2m_mul_2x2: +#if __ARM_ARCH__>=7 + ldr r12,.LOPENSSL_armcap +.Lpic: ldr r12,[pc,r12] + tst r12,#1 + beq .Lialu + + veor $A1,$A1 + vmov.32 $B1,r3,r3 @ two copies of b1 + vmov.32 ${A1}[0],r1 @ a1 + + veor $A0,$A0 + vld1.32 ${B0}[],[sp,:32] @ two copies of b0 + vmov.32 ${A0}[0],r2 @ a0 + mov r12,lr + + vmov d16,$A1 + vmov d17,$B1 + bl mul_1x1_neon @ a1·b1 + vmov $A1B1,d0 + + vmov d16,$A0 + vmov d17,$B0 + bl mul_1x1_neon @ a0·b0 + vmov $A0B0,d0 + + veor d16,$A0,$A1 + veor d17,$B0,$B1 + veor $A0,$A0B0,$A1B1 + bl mul_1x1_neon @ (a0+a1)·(b0+b1) + + veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1 + vshl.u64 d1,d0,#32 + vshr.u64 d0,d0,#32 + veor $A0B0,d1 + veor $A1B1,d0 + vst1.32 {${A0B0}[0]},[r0,:32]! + vst1.32 {${A0B0}[1]},[r0,:32]! + vst1.32 {${A1B1}[0]},[r0,:32]! + vst1.32 {${A1B1}[1]},[r0,:32] + bx r12 +.align 4 +.Lialu: +#endif +___ +$ret="r10"; # reassigned 1st argument +$code.=<<___; + stmdb sp!,{r4-r10,lr} + mov $ret,r0 @ reassign 1st argument + mov $b,r3 @ $b=b1 + ldr r3,[sp,#32] @ load b0 + mov $mask,#7<<2 + sub sp,sp,#32 @ allocate tab[8] + + bl mul_1x1_ialu @ a1·b1 + str $lo,[$ret,#8] + str $hi,[$ret,#12] + + eor $b,$b,r3 @ flip b0 and b1 + eor $a,$a,r2 @ flip a0 and a1 + eor r3,r3,$b + eor r2,r2,$a + eor $b,$b,r3 + eor $a,$a,r2 + bl mul_1x1_ialu @ a0·b0 + str $lo,[$ret] + str $hi,[$ret,#4] + + eor $a,$a,r2 + eor $b,$b,r3 + bl mul_1x1_ialu @ (a1+a0)·(b1+b0) +___ +@r=map("r$_",(6..9)); +$code.=<<___; + ldmia $ret,{@r[0]-@r[3]} + eor $lo,$lo,$hi + eor $hi,$hi,@r[1] + eor $lo,$lo,@r[0] + eor $hi,$hi,@r[2] + eor $lo,$lo,@r[3] + eor $hi,$hi,@r[3] + str $hi,[$ret,#8] + eor $lo,$lo,$hi + add sp,sp,#32 @ destroy tab[8] + str $lo,[$ret,#4] + +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r10,pc} +#else + ldmia sp!,{r4-r10,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 +#if __ARM_ARCH__>=7 +.align 5 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-(.Lpic+8) +#endif +.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by " +.align 5 + +.comm OPENSSL_armcap_P,4,4 +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +print $code; +close STDOUT; # enforce flush diff --git a/src/lib/libcrypto/bn/asm/armv4-mont.pl b/src/lib/libcrypto/bn/asm/armv4-mont.pl index 14e0d2d1dd..f78a8b5f0f 100644 --- a/src/lib/libcrypto/bn/asm/armv4-mont.pl +++ b/src/lib/libcrypto/bn/asm/armv4-mont.pl @@ -23,6 +23,9 @@ # than 1/2KB. Windows CE port would be trivial, as it's exclusively # about decorations, ABI and instruction syntax are identical. +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + $num="r0"; # starts as num argument, but holds &tp[num-1] $ap="r1"; $bp="r2"; $bi="r2"; $rp="r2"; @@ -89,9 +92,9 @@ bn_mul_mont: .L1st: ldr $aj,[$ap],#4 @ ap[j],ap++ mov $alo,$ahi + ldr $nj,[$np],#4 @ np[j],np++ mov $ahi,#0 umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0] - ldr $nj,[$np],#4 @ np[j],np++ mov $nhi,#0 umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 adds $nlo,$nlo,$alo @@ -101,21 +104,21 @@ bn_mul_mont: bne .L1st adds $nlo,$nlo,$ahi + ldr $tp,[$_bp] @ restore bp mov $nhi,#0 + ldr $n0,[$_n0] @ restore n0 adc $nhi,$nhi,#0 - ldr $tp,[$_bp] @ restore bp str $nlo,[$num] @ tp[num-1]= - ldr $n0,[$_n0] @ restore n0 str $nhi,[$num,#4] @ tp[num]= .Louter: sub $tj,$num,sp @ "original" $num-1 value sub $ap,$ap,$tj @ "rewind" ap to &ap[1] - sub $np,$np,$tj @ "rewind" np to &np[1] ldr $bi,[$tp,#4]! @ *(++bp) + sub $np,$np,$tj @ "rewind" np to &np[1] ldr $aj,[$ap,#-4] @ ap[0] - ldr $nj,[$np,#-4] @ np[0] ldr $alo,[sp] @ tp[0] + ldr $nj,[$np,#-4] @ np[0] ldr $tj,[sp,#4] @ tp[1] mov $ahi,#0 @@ -129,13 +132,13 @@ bn_mul_mont: .Linner: ldr $aj,[$ap],#4 @ ap[j],ap++ adds $alo,$ahi,$tj @ +=tp[j] + ldr $nj,[$np],#4 @ np[j],np++ mov $ahi,#0 umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i] - ldr $nj,[$np],#4 @ np[j],np++ mov $nhi,#0 umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0 - ldr $tj,[$tp,#8] @ tp[j+1] adc $ahi,$ahi,#0 + ldr $tj,[$tp,#8] @ tp[j+1] adds $nlo,$nlo,$alo str $nlo,[$tp],#4 @ tp[j-1]=,tp++ adc $nlo,$nhi,#0 @@ -144,13 +147,13 @@ bn_mul_mont: adds $nlo,$nlo,$ahi mov $nhi,#0 + ldr $tp,[$_bp] @ restore bp adc $nhi,$nhi,#0 + ldr $n0,[$_n0] @ restore n0 adds $nlo,$nlo,$tj - adc $nhi,$nhi,#0 - ldr $tp,[$_bp] @ restore bp ldr $tj,[$_bpend] @ restore &bp[num] + adc $nhi,$nhi,#0 str $nlo,[$num] @ tp[num-1]= - ldr $n0,[$_n0] @ restore n0 str $nhi,[$num,#4] @ tp[num]= cmp $tp,$tj diff --git a/src/lib/libcrypto/bn/asm/ia64-mont.pl b/src/lib/libcrypto/bn/asm/ia64-mont.pl new file mode 100644 index 0000000000..e258658428 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/ia64-mont.pl @@ -0,0 +1,851 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# January 2010 +# +# "Teaser" Montgomery multiplication module for IA-64. There are +# several possibilities for improvement: +# +# - modulo-scheduling outer loop would eliminate quite a number of +# stalls after ldf8, xma and getf.sig outside inner loop and +# improve shorter key performance; +# - shorter vector support [with input vectors being fetched only +# once] should be added; +# - 2x unroll with help of n0[1] would make the code scalable on +# "wider" IA-64, "wider" than Itanium 2 that is, which is not of +# acute interest, because upcoming Tukwila's individual cores are +# reportedly based on Itanium 2 design; +# - dedicated squaring procedure(?); +# +# January 2010 +# +# Shorter vector support is implemented by zero-padding ap and np +# vectors up to 8 elements, or 512 bits. This means that 256-bit +# inputs will be processed only 2 times faster than 512-bit inputs, +# not 4 [as one would expect, because algorithm complexity is n^2]. +# The reason for padding is that inputs shorter than 512 bits won't +# be processed faster anyway, because minimal critical path of the +# core loop happens to match 512-bit timing. Either way, it resulted +# in >100% improvement of 512-bit RSA sign benchmark and 50% - of +# 1024-bit one [in comparison to original version of *this* module]. +# +# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with* +# this module is: +# sign verify sign/s verify/s +# rsa 512 bits 0.000290s 0.000024s 3452.8 42031.4 +# rsa 1024 bits 0.000793s 0.000058s 1261.7 17172.0 +# rsa 2048 bits 0.005908s 0.000148s 169.3 6754.0 +# rsa 4096 bits 0.033456s 0.000469s 29.9 2133.6 +# dsa 512 bits 0.000253s 0.000198s 3949.9 5057.0 +# dsa 1024 bits 0.000585s 0.000607s 1708.4 1647.4 +# dsa 2048 bits 0.001453s 0.001703s 688.1 587.4 +# +# ... and *without* (but still with ia64.S): +# +# rsa 512 bits 0.000670s 0.000041s 1491.8 24145.5 +# rsa 1024 bits 0.001988s 0.000080s 502.9 12499.3 +# rsa 2048 bits 0.008702s 0.000189s 114.9 5293.9 +# rsa 4096 bits 0.043860s 0.000533s 22.8 1875.9 +# dsa 512 bits 0.000441s 0.000427s 2265.3 2340.6 +# dsa 1024 bits 0.000823s 0.000867s 1215.6 1153.2 +# dsa 2048 bits 0.001894s 0.002179s 528.1 458.9 +# +# As it can be seen, RSA sign performance improves by 130-30%, +# hereafter less for longer keys, while verify - by 74-13%. +# DSA performance improves by 115-30%. + +if ($^O eq "hpux") { + $ADDP="addp4"; + for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } +} else { $ADDP="add"; } + +$code=<<___; +.explicit +.text + +// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap, +// const BN_ULONG *bp,const BN_ULONG *np, +// const BN_ULONG *n0p,int num); +.align 64 +.global bn_mul_mont# +.proc bn_mul_mont# +bn_mul_mont: + .prologue + .body +{ .mmi; cmp4.le p6,p7=2,r37;; +(p6) cmp4.lt.unc p8,p9=8,r37 + mov ret0=r0 };; +{ .bbb; +(p9) br.cond.dptk.many bn_mul_mont_8 +(p8) br.cond.dpnt.many bn_mul_mont_general +(p7) br.ret.spnt.many b0 };; +.endp bn_mul_mont# + +prevfs=r2; prevpr=r3; prevlc=r10; prevsp=r11; + +rptr=r8; aptr=r9; bptr=r14; nptr=r15; +tptr=r16; // &tp[0] +tp_1=r17; // &tp[-1] +num=r18; len=r19; lc=r20; +topbit=r21; // carry bit from tmp[num] + +n0=f6; +m0=f7; +bi=f8; + +.align 64 +.local bn_mul_mont_general# +.proc bn_mul_mont_general# +bn_mul_mont_general: + .prologue +{ .mmi; .save ar.pfs,prevfs + alloc prevfs=ar.pfs,6,2,0,8 + $ADDP aptr=0,in1 + .save ar.lc,prevlc + mov prevlc=ar.lc } +{ .mmi; .vframe prevsp + mov prevsp=sp + $ADDP bptr=0,in2 + .save pr,prevpr + mov prevpr=pr };; + + .body + .rotf alo[6],nlo[4],ahi[8],nhi[6] + .rotr a[3],n[3],t[2] + +{ .mmi; ldf8 bi=[bptr],8 // (*bp++) + ldf8 alo[4]=[aptr],16 // ap[0] + $ADDP r30=8,in1 };; +{ .mmi; ldf8 alo[3]=[r30],16 // ap[1] + ldf8 alo[2]=[aptr],16 // ap[2] + $ADDP in4=0,in4 };; +{ .mmi; ldf8 alo[1]=[r30] // ap[3] + ldf8 n0=[in4] // n0 + $ADDP rptr=0,in0 } +{ .mmi; $ADDP nptr=0,in3 + mov r31=16 + zxt4 num=in5 };; +{ .mmi; ldf8 nlo[2]=[nptr],8 // np[0] + shladd len=num,3,r0 + shladd r31=num,3,r31 };; +{ .mmi; ldf8 nlo[1]=[nptr],8 // np[1] + add lc=-5,num + sub r31=sp,r31 };; +{ .mfb; and sp=-16,r31 // alloca + xmpy.hu ahi[2]=alo[4],bi // ap[0]*bp[0] + nop.b 0 } +{ .mfb; nop.m 0 + xmpy.lu alo[4]=alo[4],bi + brp.loop.imp .L1st_ctop,.L1st_cend-16 + };; +{ .mfi; nop.m 0 + xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0] + add tp_1=8,sp } +{ .mfi; nop.m 0 + xma.lu alo[3]=alo[3],bi,ahi[2] + mov pr.rot=0x20001f<<16 + // ------^----- (p40) at first (p23) + // ----------^^ p[16:20]=1 + };; +{ .mfi; nop.m 0 + xmpy.lu m0=alo[4],n0 // (ap[0]*bp[0])*n0 + mov ar.lc=lc } +{ .mfi; nop.m 0 + fcvt.fxu.s1 nhi[1]=f0 + mov ar.ec=8 };; + +.align 32 +.L1st_ctop: +.pred.rel "mutex",p40,p42 +{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) + (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] + (p40) add n[2]=n[2],a[2] } // (p23) } +{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++)(p16) + (p18) xma.lu alo[2]=alo[2],bi,ahi[1] + (p42) add n[2]=n[2],a[2],1 };; // (p23) +{ .mfi; (p21) getf.sig a[0]=alo[5] + (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] + (p42) cmp.leu p41,p39=n[2],a[2] } // (p23) +{ .mfi; (p23) st8 [tp_1]=n[2],8 + (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] + (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) +{ .mmb; (p21) getf.sig n[0]=nlo[3] + (p16) nop.m 0 + br.ctop.sptk .L1st_ctop };; +.L1st_cend: + +{ .mmi; getf.sig a[0]=ahi[6] // (p24) + getf.sig n[0]=nhi[4] + add num=-1,num };; // num-- +{ .mmi; .pred.rel "mutex",p40,p42 +(p40) add n[0]=n[0],a[0] +(p42) add n[0]=n[0],a[0],1 + sub aptr=aptr,len };; // rewind +{ .mmi; .pred.rel "mutex",p40,p42 +(p40) cmp.ltu p41,p39=n[0],a[0] +(p42) cmp.leu p41,p39=n[0],a[0] + sub nptr=nptr,len };; +{ .mmi; .pred.rel "mutex",p39,p41 +(p39) add topbit=r0,r0 +(p41) add topbit=r0,r0,1 + nop.i 0 } +{ .mmi; st8 [tp_1]=n[0] + add tptr=16,sp + add tp_1=8,sp };; + +.Louter: +{ .mmi; ldf8 bi=[bptr],8 // (*bp++) + ldf8 ahi[3]=[tptr] // tp[0] + add r30=8,aptr };; +{ .mmi; ldf8 alo[4]=[aptr],16 // ap[0] + ldf8 alo[3]=[r30],16 // ap[1] + add r31=8,nptr };; +{ .mfb; ldf8 alo[2]=[aptr],16 // ap[2] + xma.hu ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0] + brp.loop.imp .Linner_ctop,.Linner_cend-16 + } +{ .mfb; ldf8 alo[1]=[r30] // ap[3] + xma.lu alo[4]=alo[4],bi,ahi[3] + clrrrb.pr };; +{ .mfi; ldf8 nlo[2]=[nptr],16 // np[0] + xma.hu ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i] + nop.i 0 } +{ .mfi; ldf8 nlo[1]=[r31] // np[1] + xma.lu alo[3]=alo[3],bi,ahi[2] + mov pr.rot=0x20101f<<16 + // ------^----- (p40) at first (p23) + // --------^--- (p30) at first (p22) + // ----------^^ p[16:20]=1 + };; +{ .mfi; st8 [tptr]=r0 // tp[0] is already accounted + xmpy.lu m0=alo[4],n0 // (ap[0]*bp[i]+tp[0])*n0 + mov ar.lc=lc } +{ .mfi; + fcvt.fxu.s1 nhi[1]=f0 + mov ar.ec=8 };; + +// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in +// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7 +// in latter case accounts for two-tick pipeline stall, which means +// that its performance would be ~20% lower than optimal one. No +// attempt was made to address this, because original Itanium is +// hardly represented out in the wild... +.align 32 +.Linner_ctop: +.pred.rel "mutex",p40,p42 +.pred.rel "mutex",p30,p32 +{ .mfi; (p16) ldf8 alo[0]=[aptr],8 // *(aptr++) + (p18) xma.hu ahi[0]=alo[2],bi,ahi[1] + (p40) add n[2]=n[2],a[2] } // (p23) +{ .mfi; (p16) nop.m 0 + (p18) xma.lu alo[2]=alo[2],bi,ahi[1] + (p42) add n[2]=n[2],a[2],1 };; // (p23) +{ .mfi; (p21) getf.sig a[0]=alo[5] + (p16) nop.f 0 + (p40) cmp.ltu p41,p39=n[2],a[2] } // (p23) +{ .mfi; (p21) ld8 t[0]=[tptr],8 + (p16) nop.f 0 + (p42) cmp.leu p41,p39=n[2],a[2] };; // (p23) +{ .mfi; (p18) ldf8 nlo[0]=[nptr],8 // *(nptr++) + (p20) xma.hu nhi[0]=nlo[2],m0,nhi[1] + (p30) add a[1]=a[1],t[1] } // (p22) +{ .mfi; (p16) nop.m 0 + (p20) xma.lu nlo[2]=nlo[2],m0,nhi[1] + (p32) add a[1]=a[1],t[1],1 };; // (p22) +{ .mmi; (p21) getf.sig n[0]=nlo[3] + (p16) nop.m 0 + (p30) cmp.ltu p31,p29=a[1],t[1] } // (p22) +{ .mmb; (p23) st8 [tp_1]=n[2],8 + (p32) cmp.leu p31,p29=a[1],t[1] // (p22) + br.ctop.sptk .Linner_ctop };; +.Linner_cend: + +{ .mmi; getf.sig a[0]=ahi[6] // (p24) + getf.sig n[0]=nhi[4] + nop.i 0 };; + +{ .mmi; .pred.rel "mutex",p31,p33 +(p31) add a[0]=a[0],topbit +(p33) add a[0]=a[0],topbit,1 + mov topbit=r0 };; +{ .mfi; .pred.rel "mutex",p31,p33 +(p31) cmp.ltu p32,p30=a[0],topbit +(p33) cmp.leu p32,p30=a[0],topbit + } +{ .mfi; .pred.rel "mutex",p40,p42 +(p40) add n[0]=n[0],a[0] +(p42) add n[0]=n[0],a[0],1 + };; +{ .mmi; .pred.rel "mutex",p44,p46 +(p40) cmp.ltu p41,p39=n[0],a[0] +(p42) cmp.leu p41,p39=n[0],a[0] +(p32) add topbit=r0,r0,1 } + +{ .mmi; st8 [tp_1]=n[0],8 + cmp4.ne p6,p0=1,num + sub aptr=aptr,len };; // rewind +{ .mmi; sub nptr=nptr,len +(p41) add topbit=r0,r0,1 + add tptr=16,sp } +{ .mmb; add tp_1=8,sp + add num=-1,num // num-- +(p6) br.cond.sptk.many .Louter };; + +{ .mbb; add lc=4,lc + brp.loop.imp .Lsub_ctop,.Lsub_cend-16 + clrrrb.pr };; +{ .mii; nop.m 0 + mov pr.rot=0x10001<<16 + // ------^---- (p33) at first (p17) + mov ar.lc=lc } +{ .mii; nop.m 0 + mov ar.ec=3 + nop.i 0 };; + +.Lsub_ctop: +.pred.rel "mutex",p33,p35 +{ .mfi; (p16) ld8 t[0]=[tptr],8 // t=*(tp++) + (p16) nop.f 0 + (p33) sub n[1]=t[1],n[1] } // (p17) +{ .mfi; (p16) ld8 n[0]=[nptr],8 // n=*(np++) + (p16) nop.f 0 + (p35) sub n[1]=t[1],n[1],1 };; // (p17) +{ .mib; (p18) st8 [rptr]=n[2],8 // *(rp++)=r + (p33) cmp.gtu p34,p32=n[1],t[1] // (p17) + (p18) nop.b 0 } +{ .mib; (p18) nop.m 0 + (p35) cmp.geu p34,p32=n[1],t[1] // (p17) + br.ctop.sptk .Lsub_ctop };; +.Lsub_cend: + +{ .mmb; .pred.rel "mutex",p34,p36 +(p34) sub topbit=topbit,r0 // (p19) +(p36) sub topbit=topbit,r0,1 + brp.loop.imp .Lcopy_ctop,.Lcopy_cend-16 + } +{ .mmb; sub rptr=rptr,len // rewind + sub tptr=tptr,len + clrrrb.pr };; +{ .mmi; and aptr=tptr,topbit + andcm bptr=rptr,topbit + mov pr.rot=1<<16 };; +{ .mii; or nptr=aptr,bptr + mov ar.lc=lc + mov ar.ec=3 };; + +.Lcopy_ctop: +{ .mmb; (p16) ld8 n[0]=[nptr],8 + (p18) st8 [tptr]=r0,8 + (p16) nop.b 0 } +{ .mmb; (p16) nop.m 0 + (p18) st8 [rptr]=n[2],8 + br.ctop.sptk .Lcopy_ctop };; +.Lcopy_cend: + +{ .mmi; mov ret0=1 // signal "handled" + rum 1<<5 // clear um.mfh + mov ar.lc=prevlc } +{ .mib; .restore sp + mov sp=prevsp + mov pr=prevpr,0x1ffff + br.ret.sptk.many b0 };; +.endp bn_mul_mont_general# + +a1=r16; a2=r17; a3=r18; a4=r19; a5=r20; a6=r21; a7=r22; a8=r23; +n1=r24; n2=r25; n3=r26; n4=r27; n5=r28; n6=r29; n7=r30; n8=r31; +t0=r15; + +ai0=f8; ai1=f9; ai2=f10; ai3=f11; ai4=f12; ai5=f13; ai6=f14; ai7=f15; +ni0=f16; ni1=f17; ni2=f18; ni3=f19; ni4=f20; ni5=f21; ni6=f22; ni7=f23; + +.align 64 +.skip 48 // aligns loop body +.local bn_mul_mont_8# +.proc bn_mul_mont_8# +bn_mul_mont_8: + .prologue +{ .mmi; .save ar.pfs,prevfs + alloc prevfs=ar.pfs,6,2,0,8 + .vframe prevsp + mov prevsp=sp + .save ar.lc,prevlc + mov prevlc=ar.lc } +{ .mmi; add r17=-6*16,sp + add sp=-7*16,sp + .save pr,prevpr + mov prevpr=pr };; + +{ .mmi; .save.gf 0,0x10 + stf.spill [sp]=f16,-16 + .save.gf 0,0x20 + stf.spill [r17]=f17,32 + add r16=-5*16,prevsp};; +{ .mmi; .save.gf 0,0x40 + stf.spill [r16]=f18,32 + .save.gf 0,0x80 + stf.spill [r17]=f19,32 + $ADDP aptr=0,in1 };; +{ .mmi; .save.gf 0,0x100 + stf.spill [r16]=f20,32 + .save.gf 0,0x200 + stf.spill [r17]=f21,32 + $ADDP r29=8,in1 };; +{ .mmi; .save.gf 0,0x400 + stf.spill [r16]=f22 + .save.gf 0,0x800 + stf.spill [r17]=f23 + $ADDP rptr=0,in0 };; + + .body + .rotf bj[8],mj[2],tf[2],alo[10],ahi[10],nlo[10],nhi[10] + .rotr t[8] + +// load input vectors padding them to 8 elements +{ .mmi; ldf8 ai0=[aptr],16 // ap[0] + ldf8 ai1=[r29],16 // ap[1] + $ADDP bptr=0,in2 } +{ .mmi; $ADDP r30=8,in2 + $ADDP nptr=0,in3 + $ADDP r31=8,in3 };; +{ .mmi; ldf8 bj[7]=[bptr],16 // bp[0] + ldf8 bj[6]=[r30],16 // bp[1] + cmp4.le p4,p5=3,in5 } +{ .mmi; ldf8 ni0=[nptr],16 // np[0] + ldf8 ni1=[r31],16 // np[1] + cmp4.le p6,p7=4,in5 };; + +{ .mfi; (p4)ldf8 ai2=[aptr],16 // ap[2] + (p5)fcvt.fxu ai2=f0 + cmp4.le p8,p9=5,in5 } +{ .mfi; (p6)ldf8 ai3=[r29],16 // ap[3] + (p7)fcvt.fxu ai3=f0 + cmp4.le p10,p11=6,in5 } +{ .mfi; (p4)ldf8 bj[5]=[bptr],16 // bp[2] + (p5)fcvt.fxu bj[5]=f0 + cmp4.le p12,p13=7,in5 } +{ .mfi; (p6)ldf8 bj[4]=[r30],16 // bp[3] + (p7)fcvt.fxu bj[4]=f0 + cmp4.le p14,p15=8,in5 } +{ .mfi; (p4)ldf8 ni2=[nptr],16 // np[2] + (p5)fcvt.fxu ni2=f0 + addp4 r28=-1,in5 } +{ .mfi; (p6)ldf8 ni3=[r31],16 // np[3] + (p7)fcvt.fxu ni3=f0 + $ADDP in4=0,in4 };; + +{ .mfi; ldf8 n0=[in4] + fcvt.fxu tf[1]=f0 + nop.i 0 } + +{ .mfi; (p8)ldf8 ai4=[aptr],16 // ap[4] + (p9)fcvt.fxu ai4=f0 + mov t[0]=r0 } +{ .mfi; (p10)ldf8 ai5=[r29],16 // ap[5] + (p11)fcvt.fxu ai5=f0 + mov t[1]=r0 } +{ .mfi; (p8)ldf8 bj[3]=[bptr],16 // bp[4] + (p9)fcvt.fxu bj[3]=f0 + mov t[2]=r0 } +{ .mfi; (p10)ldf8 bj[2]=[r30],16 // bp[5] + (p11)fcvt.fxu bj[2]=f0 + mov t[3]=r0 } +{ .mfi; (p8)ldf8 ni4=[nptr],16 // np[4] + (p9)fcvt.fxu ni4=f0 + mov t[4]=r0 } +{ .mfi; (p10)ldf8 ni5=[r31],16 // np[5] + (p11)fcvt.fxu ni5=f0 + mov t[5]=r0 };; + +{ .mfi; (p12)ldf8 ai6=[aptr],16 // ap[6] + (p13)fcvt.fxu ai6=f0 + mov t[6]=r0 } +{ .mfi; (p14)ldf8 ai7=[r29],16 // ap[7] + (p15)fcvt.fxu ai7=f0 + mov t[7]=r0 } +{ .mfi; (p12)ldf8 bj[1]=[bptr],16 // bp[6] + (p13)fcvt.fxu bj[1]=f0 + mov ar.lc=r28 } +{ .mfi; (p14)ldf8 bj[0]=[r30],16 // bp[7] + (p15)fcvt.fxu bj[0]=f0 + mov ar.ec=1 } +{ .mfi; (p12)ldf8 ni6=[nptr],16 // np[6] + (p13)fcvt.fxu ni6=f0 + mov pr.rot=1<<16 } +{ .mfb; (p14)ldf8 ni7=[r31],16 // np[7] + (p15)fcvt.fxu ni7=f0 + brp.loop.imp .Louter_8_ctop,.Louter_8_cend-16 + };; + +// The loop is scheduled for 32*n ticks on Itanium 2. Actual attempt +// to measure with help of Interval Time Counter indicated that the +// factor is a tad higher: 33 or 34, if not 35. Exact measurement and +// addressing the issue is problematic, because I don't have access +// to platform-specific instruction-level profiler. On Itanium it +// should run in 56*n ticks, because of higher xma latency... +.Louter_8_ctop: + .pred.rel "mutex",p40,p42 + .pred.rel "mutex",p48,p50 +{ .mfi; (p16) nop.m 0 // 0: + (p16) xma.hu ahi[0]=ai0,bj[7],tf[1] // ap[0]*b[i]+t[0] + (p40) add a3=a3,n3 } // (p17) a3+=n3 +{ .mfi; (p42) add a3=a3,n3,1 + (p16) xma.lu alo[0]=ai0,bj[7],tf[1] + (p16) nop.i 0 };; +{ .mii; (p17) getf.sig a7=alo[8] // 1: + (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 + (p50) add t[6]=t[6],a3,1 };; +{ .mfi; (p17) getf.sig a8=ahi[8] // 2: + (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 + (p40) cmp.ltu p43,p41=a3,n3 } +{ .mfi; (p42) cmp.leu p43,p41=a3,n3 + (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] + (p16) nop.i 0 };; +{ .mii; (p17) getf.sig n5=nlo[6] // 3: + (p48) cmp.ltu p51,p49=t[6],a3 + (p50) cmp.leu p51,p49=t[6],a3 };; + .pred.rel "mutex",p41,p43 + .pred.rel "mutex",p49,p51 +{ .mfi; (p16) nop.m 0 // 4: + (p16) xma.hu ahi[1]=ai1,bj[7],ahi[0] // ap[1]*b[i] + (p41) add a4=a4,n4 } // (p17) a4+=n4 +{ .mfi; (p43) add a4=a4,n4,1 + (p16) xma.lu alo[1]=ai1,bj[7],ahi[0] + (p16) nop.i 0 };; +{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 + (p16) xmpy.lu mj[0]=alo[0],n0 // (ap[0]*b[i]+t[0])*n0 + (p51) add t[5]=t[5],a4,1 };; +{ .mfi; (p16) nop.m 0 // 6: + (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 + (p41) cmp.ltu p42,p40=a4,n4 } +{ .mfi; (p43) cmp.leu p42,p40=a4,n4 + (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] + (p16) nop.i 0 };; +{ .mii; (p17) getf.sig n6=nlo[7] // 7: + (p49) cmp.ltu p50,p48=t[5],a4 + (p51) cmp.leu p50,p48=t[5],a4 };; + .pred.rel "mutex",p40,p42 + .pred.rel "mutex",p48,p50 +{ .mfi; (p16) nop.m 0 // 8: + (p16) xma.hu ahi[2]=ai2,bj[7],ahi[1] // ap[2]*b[i] + (p40) add a5=a5,n5 } // (p17) a5+=n5 +{ .mfi; (p42) add a5=a5,n5,1 + (p16) xma.lu alo[2]=ai2,bj[7],ahi[1] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig a1=alo[1] // 9: + (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 + (p50) add t[4]=t[4],a5,1 };; +{ .mfi; (p16) nop.m 0 // 10: + (p16) xma.hu nhi[0]=ni0,mj[0],alo[0] // np[0]*m0 + (p40) cmp.ltu p43,p41=a5,n5 } +{ .mfi; (p42) cmp.leu p43,p41=a5,n5 + (p16) xma.lu nlo[0]=ni0,mj[0],alo[0] + (p16) nop.i 0 };; +{ .mii; (p17) getf.sig n7=nlo[8] // 11: + (p48) cmp.ltu p51,p49=t[4],a5 + (p50) cmp.leu p51,p49=t[4],a5 };; + .pred.rel "mutex",p41,p43 + .pred.rel "mutex",p49,p51 +{ .mfi; (p17) getf.sig n8=nhi[8] // 12: + (p16) xma.hu ahi[3]=ai3,bj[7],ahi[2] // ap[3]*b[i] + (p41) add a6=a6,n6 } // (p17) a6+=n6 +{ .mfi; (p43) add a6=a6,n6,1 + (p16) xma.lu alo[3]=ai3,bj[7],ahi[2] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig a2=alo[2] // 13: + (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 + (p51) add t[3]=t[3],a6,1 };; +{ .mfi; (p16) nop.m 0 // 14: + (p16) xma.hu nhi[1]=ni1,mj[0],nhi[0] // np[1]*m0 + (p41) cmp.ltu p42,p40=a6,n6 } +{ .mfi; (p43) cmp.leu p42,p40=a6,n6 + (p16) xma.lu nlo[1]=ni1,mj[0],nhi[0] + (p16) nop.i 0 };; +{ .mii; (p16) nop.m 0 // 15: + (p49) cmp.ltu p50,p48=t[3],a6 + (p51) cmp.leu p50,p48=t[3],a6 };; + .pred.rel "mutex",p40,p42 + .pred.rel "mutex",p48,p50 +{ .mfi; (p16) nop.m 0 // 16: + (p16) xma.hu ahi[4]=ai4,bj[7],ahi[3] // ap[4]*b[i] + (p40) add a7=a7,n7 } // (p17) a7+=n7 +{ .mfi; (p42) add a7=a7,n7,1 + (p16) xma.lu alo[4]=ai4,bj[7],ahi[3] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig a3=alo[3] // 17: + (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 + (p50) add t[2]=t[2],a7,1 };; +{ .mfi; (p16) nop.m 0 // 18: + (p16) xma.hu nhi[2]=ni2,mj[0],nhi[1] // np[2]*m0 + (p40) cmp.ltu p43,p41=a7,n7 } +{ .mfi; (p42) cmp.leu p43,p41=a7,n7 + (p16) xma.lu nlo[2]=ni2,mj[0],nhi[1] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig n1=nlo[1] // 19: + (p48) cmp.ltu p51,p49=t[2],a7 + (p50) cmp.leu p51,p49=t[2],a7 };; + .pred.rel "mutex",p41,p43 + .pred.rel "mutex",p49,p51 +{ .mfi; (p16) nop.m 0 // 20: + (p16) xma.hu ahi[5]=ai5,bj[7],ahi[4] // ap[5]*b[i] + (p41) add a8=a8,n8 } // (p17) a8+=n8 +{ .mfi; (p43) add a8=a8,n8,1 + (p16) xma.lu alo[5]=ai5,bj[7],ahi[4] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig a4=alo[4] // 21: + (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 + (p51) add t[1]=t[1],a8,1 };; +{ .mfi; (p16) nop.m 0 // 22: + (p16) xma.hu nhi[3]=ni3,mj[0],nhi[2] // np[3]*m0 + (p41) cmp.ltu p42,p40=a8,n8 } +{ .mfi; (p43) cmp.leu p42,p40=a8,n8 + (p16) xma.lu nlo[3]=ni3,mj[0],nhi[2] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig n2=nlo[2] // 23: + (p49) cmp.ltu p50,p48=t[1],a8 + (p51) cmp.leu p50,p48=t[1],a8 };; +{ .mfi; (p16) nop.m 0 // 24: + (p16) xma.hu ahi[6]=ai6,bj[7],ahi[5] // ap[6]*b[i] + (p16) add a1=a1,n1 } // (p16) a1+=n1 +{ .mfi; (p16) nop.m 0 + (p16) xma.lu alo[6]=ai6,bj[7],ahi[5] + (p17) mov t[0]=r0 };; +{ .mii; (p16) getf.sig a5=alo[5] // 25: + (p16) add t0=t[7],a1 // (p16) t[7]+=a1 + (p42) add t[0]=t[0],r0,1 };; +{ .mfi; (p16) setf.sig tf[0]=t0 // 26: + (p16) xma.hu nhi[4]=ni4,mj[0],nhi[3] // np[4]*m0 + (p50) add t[0]=t[0],r0,1 } +{ .mfi; (p16) cmp.ltu.unc p42,p40=a1,n1 + (p16) xma.lu nlo[4]=ni4,mj[0],nhi[3] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig n3=nlo[3] // 27: + (p16) cmp.ltu.unc p50,p48=t0,a1 + (p16) nop.i 0 };; + .pred.rel "mutex",p40,p42 + .pred.rel "mutex",p48,p50 +{ .mfi; (p16) nop.m 0 // 28: + (p16) xma.hu ahi[7]=ai7,bj[7],ahi[6] // ap[7]*b[i] + (p40) add a2=a2,n2 } // (p16) a2+=n2 +{ .mfi; (p42) add a2=a2,n2,1 + (p16) xma.lu alo[7]=ai7,bj[7],ahi[6] + (p16) nop.i 0 };; +{ .mii; (p16) getf.sig a6=alo[6] // 29: + (p48) add t[6]=t[6],a2 // (p16) t[6]+=a2 + (p50) add t[6]=t[6],a2,1 };; +{ .mfi; (p16) nop.m 0 // 30: + (p16) xma.hu nhi[5]=ni5,mj[0],nhi[4] // np[5]*m0 + (p40) cmp.ltu p41,p39=a2,n2 } +{ .mfi; (p42) cmp.leu p41,p39=a2,n2 + (p16) xma.lu nlo[5]=ni5,mj[0],nhi[4] + (p16) nop.i 0 };; +{ .mfi; (p16) getf.sig n4=nlo[4] // 31: + (p16) nop.f 0 + (p48) cmp.ltu p49,p47=t[6],a2 } +{ .mfb; (p50) cmp.leu p49,p47=t[6],a2 + (p16) nop.f 0 + br.ctop.sptk.many .Louter_8_ctop };; +.Louter_8_cend: + +// above loop has to execute one more time, without (p16), which is +// replaced with merged move of np[8] to GPR bank + .pred.rel "mutex",p40,p42 + .pred.rel "mutex",p48,p50 +{ .mmi; (p0) getf.sig n1=ni0 // 0: + (p40) add a3=a3,n3 // (p17) a3+=n3 + (p42) add a3=a3,n3,1 };; +{ .mii; (p17) getf.sig a7=alo[8] // 1: + (p48) add t[6]=t[6],a3 // (p17) t[6]+=a3 + (p50) add t[6]=t[6],a3,1 };; +{ .mfi; (p17) getf.sig a8=ahi[8] // 2: + (p17) xma.hu nhi[7]=ni6,mj[1],nhi[6] // np[6]*m0 + (p40) cmp.ltu p43,p41=a3,n3 } +{ .mfi; (p42) cmp.leu p43,p41=a3,n3 + (p17) xma.lu nlo[7]=ni6,mj[1],nhi[6] + (p0) nop.i 0 };; +{ .mii; (p17) getf.sig n5=nlo[6] // 3: + (p48) cmp.ltu p51,p49=t[6],a3 + (p50) cmp.leu p51,p49=t[6],a3 };; + .pred.rel "mutex",p41,p43 + .pred.rel "mutex",p49,p51 +{ .mmi; (p0) getf.sig n2=ni1 // 4: + (p41) add a4=a4,n4 // (p17) a4+=n4 + (p43) add a4=a4,n4,1 };; +{ .mfi; (p49) add t[5]=t[5],a4 // 5: (p17) t[5]+=a4 + (p0) nop.f 0 + (p51) add t[5]=t[5],a4,1 };; +{ .mfi; (p0) getf.sig n3=ni2 // 6: + (p17) xma.hu nhi[8]=ni7,mj[1],nhi[7] // np[7]*m0 + (p41) cmp.ltu p42,p40=a4,n4 } +{ .mfi; (p43) cmp.leu p42,p40=a4,n4 + (p17) xma.lu nlo[8]=ni7,mj[1],nhi[7] + (p0) nop.i 0 };; +{ .mii; (p17) getf.sig n6=nlo[7] // 7: + (p49) cmp.ltu p50,p48=t[5],a4 + (p51) cmp.leu p50,p48=t[5],a4 };; + .pred.rel "mutex",p40,p42 + .pred.rel "mutex",p48,p50 +{ .mii; (p0) getf.sig n4=ni3 // 8: + (p40) add a5=a5,n5 // (p17) a5+=n5 + (p42) add a5=a5,n5,1 };; +{ .mii; (p0) nop.m 0 // 9: + (p48) add t[4]=t[4],a5 // p(17) t[4]+=a5 + (p50) add t[4]=t[4],a5,1 };; +{ .mii; (p0) nop.m 0 // 10: + (p40) cmp.ltu p43,p41=a5,n5 + (p42) cmp.leu p43,p41=a5,n5 };; +{ .mii; (p17) getf.sig n7=nlo[8] // 11: + (p48) cmp.ltu p51,p49=t[4],a5 + (p50) cmp.leu p51,p49=t[4],a5 };; + .pred.rel "mutex",p41,p43 + .pred.rel "mutex",p49,p51 +{ .mii; (p17) getf.sig n8=nhi[8] // 12: + (p41) add a6=a6,n6 // (p17) a6+=n6 + (p43) add a6=a6,n6,1 };; +{ .mii; (p0) getf.sig n5=ni4 // 13: + (p49) add t[3]=t[3],a6 // (p17) t[3]+=a6 + (p51) add t[3]=t[3],a6,1 };; +{ .mii; (p0) nop.m 0 // 14: + (p41) cmp.ltu p42,p40=a6,n6 + (p43) cmp.leu p42,p40=a6,n6 };; +{ .mii; (p0) getf.sig n6=ni5 // 15: + (p49) cmp.ltu p50,p48=t[3],a6 + (p51) cmp.leu p50,p48=t[3],a6 };; + .pred.rel "mutex",p40,p42 + .pred.rel "mutex",p48,p50 +{ .mii; (p0) nop.m 0 // 16: + (p40) add a7=a7,n7 // (p17) a7+=n7 + (p42) add a7=a7,n7,1 };; +{ .mii; (p0) nop.m 0 // 17: + (p48) add t[2]=t[2],a7 // (p17) t[2]+=a7 + (p50) add t[2]=t[2],a7,1 };; +{ .mii; (p0) nop.m 0 // 18: + (p40) cmp.ltu p43,p41=a7,n7 + (p42) cmp.leu p43,p41=a7,n7 };; +{ .mii; (p0) getf.sig n7=ni6 // 19: + (p48) cmp.ltu p51,p49=t[2],a7 + (p50) cmp.leu p51,p49=t[2],a7 };; + .pred.rel "mutex",p41,p43 + .pred.rel "mutex",p49,p51 +{ .mii; (p0) nop.m 0 // 20: + (p41) add a8=a8,n8 // (p17) a8+=n8 + (p43) add a8=a8,n8,1 };; +{ .mmi; (p0) nop.m 0 // 21: + (p49) add t[1]=t[1],a8 // (p17) t[1]+=a8 + (p51) add t[1]=t[1],a8,1 } +{ .mmi; (p17) mov t[0]=r0 + (p41) cmp.ltu p42,p40=a8,n8 + (p43) cmp.leu p42,p40=a8,n8 };; +{ .mmi; (p0) getf.sig n8=ni7 // 22: + (p49) cmp.ltu p50,p48=t[1],a8 + (p51) cmp.leu p50,p48=t[1],a8 } +{ .mmi; (p42) add t[0]=t[0],r0,1 + (p0) add r16=-7*16,prevsp + (p0) add r17=-6*16,prevsp };; + +// subtract np[8] from carrybit|tmp[8] +// carrybit|tmp[8] layout upon exit from above loop is: +// t[0]|t[1]|t[2]|t[3]|t[4]|t[5]|t[6]|t[7]|t0 (least significant) +{ .mmi; (p50)add t[0]=t[0],r0,1 + add r18=-5*16,prevsp + sub n1=t0,n1 };; +{ .mmi; cmp.gtu p34,p32=n1,t0;; + .pred.rel "mutex",p32,p34 + (p32)sub n2=t[7],n2 + (p34)sub n2=t[7],n2,1 };; +{ .mii; (p32)cmp.gtu p35,p33=n2,t[7] + (p34)cmp.geu p35,p33=n2,t[7];; + .pred.rel "mutex",p33,p35 + (p33)sub n3=t[6],n3 } +{ .mmi; (p35)sub n3=t[6],n3,1;; + (p33)cmp.gtu p34,p32=n3,t[6] + (p35)cmp.geu p34,p32=n3,t[6] };; + .pred.rel "mutex",p32,p34 +{ .mii; (p32)sub n4=t[5],n4 + (p34)sub n4=t[5],n4,1;; + (p32)cmp.gtu p35,p33=n4,t[5] } +{ .mmi; (p34)cmp.geu p35,p33=n4,t[5];; + .pred.rel "mutex",p33,p35 + (p33)sub n5=t[4],n5 + (p35)sub n5=t[4],n5,1 };; +{ .mii; (p33)cmp.gtu p34,p32=n5,t[4] + (p35)cmp.geu p34,p32=n5,t[4];; + .pred.rel "mutex",p32,p34 + (p32)sub n6=t[3],n6 } +{ .mmi; (p34)sub n6=t[3],n6,1;; + (p32)cmp.gtu p35,p33=n6,t[3] + (p34)cmp.geu p35,p33=n6,t[3] };; + .pred.rel "mutex",p33,p35 +{ .mii; (p33)sub n7=t[2],n7 + (p35)sub n7=t[2],n7,1;; + (p33)cmp.gtu p34,p32=n7,t[2] } +{ .mmi; (p35)cmp.geu p34,p32=n7,t[2];; + .pred.rel "mutex",p32,p34 + (p32)sub n8=t[1],n8 + (p34)sub n8=t[1],n8,1 };; +{ .mii; (p32)cmp.gtu p35,p33=n8,t[1] + (p34)cmp.geu p35,p33=n8,t[1];; + .pred.rel "mutex",p33,p35 + (p33)sub a8=t[0],r0 } +{ .mmi; (p35)sub a8=t[0],r0,1;; + (p33)cmp.gtu p34,p32=a8,t[0] + (p35)cmp.geu p34,p32=a8,t[0] };; + +// save the result, either tmp[num] or tmp[num]-np[num] + .pred.rel "mutex",p32,p34 +{ .mmi; (p32)st8 [rptr]=n1,8 + (p34)st8 [rptr]=t0,8 + add r19=-4*16,prevsp};; +{ .mmb; (p32)st8 [rptr]=n2,8 + (p34)st8 [rptr]=t[7],8 + (p5)br.cond.dpnt.few .Ldone };; +{ .mmb; (p32)st8 [rptr]=n3,8 + (p34)st8 [rptr]=t[6],8 + (p7)br.cond.dpnt.few .Ldone };; +{ .mmb; (p32)st8 [rptr]=n4,8 + (p34)st8 [rptr]=t[5],8 + (p9)br.cond.dpnt.few .Ldone };; +{ .mmb; (p32)st8 [rptr]=n5,8 + (p34)st8 [rptr]=t[4],8 + (p11)br.cond.dpnt.few .Ldone };; +{ .mmb; (p32)st8 [rptr]=n6,8 + (p34)st8 [rptr]=t[3],8 + (p13)br.cond.dpnt.few .Ldone };; +{ .mmb; (p32)st8 [rptr]=n7,8 + (p34)st8 [rptr]=t[2],8 + (p15)br.cond.dpnt.few .Ldone };; +{ .mmb; (p32)st8 [rptr]=n8,8 + (p34)st8 [rptr]=t[1],8 + nop.b 0 };; +.Ldone: // epilogue +{ .mmi; ldf.fill f16=[r16],64 + ldf.fill f17=[r17],64 + nop.i 0 } +{ .mmi; ldf.fill f18=[r18],64 + ldf.fill f19=[r19],64 + mov pr=prevpr,0x1ffff };; +{ .mmi; ldf.fill f20=[r16] + ldf.fill f21=[r17] + mov ar.lc=prevlc } +{ .mmi; ldf.fill f22=[r18] + ldf.fill f23=[r19] + mov ret0=1 } // signal "handled" +{ .mib; rum 1<<5 + .restore sp + mov sp=prevsp + br.ret.sptk.many b0 };; +.endp bn_mul_mont_8# + +.type copyright#,\@object +copyright: +stringz "Montgomery multiplication for IA-64, CRYPTOGAMS by " +___ + +$output=shift and open STDOUT,">$output"; +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/bn/asm/mips-mont.pl b/src/lib/libcrypto/bn/asm/mips-mont.pl new file mode 100644 index 0000000000..b944a12b8e --- /dev/null +++ b/src/lib/libcrypto/bn/asm/mips-mont.pl @@ -0,0 +1,426 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# This module doesn't present direct interest for OpenSSL, because it +# doesn't provide better performance for longer keys, at least not on +# in-order-execution cores. While 512-bit RSA sign operations can be +# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and +# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from +# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA +# verify:-( All comparisons are against bn_mul_mont-free assembler. +# The module might be of interest to embedded system developers, as +# the code is smaller than 1KB, yet offers >3x improvement on MIPS64 +# and 75-30% [less for longer keys] on MIPS32 over compiler-generated +# code. + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 + +if ($flavour =~ /64|n32/i) { + $PTR_ADD="dadd"; # incidentally works even on n32 + $PTR_SUB="dsub"; # incidentally works even on n32 + $REG_S="sd"; + $REG_L="ld"; + $SZREG=8; +} else { + $PTR_ADD="add"; + $PTR_SUB="sub"; + $REG_S="sw"; + $REG_L="lw"; + $SZREG=4; +} +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000; +# +# +# +###################################################################### + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +if ($flavour =~ /64|n32/i) { + $LD="ld"; + $ST="sd"; + $MULTU="dmultu"; + $ADDU="daddu"; + $SUBU="dsubu"; + $BNSZ=8; +} else { + $LD="lw"; + $ST="sw"; + $MULTU="multu"; + $ADDU="addu"; + $SUBU="subu"; + $BNSZ=4; +} + +# int bn_mul_mont( +$rp=$a0; # BN_ULONG *rp, +$ap=$a1; # const BN_ULONG *ap, +$bp=$a2; # const BN_ULONG *bp, +$np=$a3; # const BN_ULONG *np, +$n0=$a4; # const BN_ULONG *n0, +$num=$a5; # int num); + +$lo0=$a6; +$hi0=$a7; +$lo1=$t1; +$hi1=$t2; +$aj=$s0; +$bi=$s1; +$nj=$s2; +$tp=$s3; +$alo=$s4; +$ahi=$s5; +$nlo=$s6; +$nhi=$s7; +$tj=$s8; +$i=$s9; +$j=$s10; +$m1=$s11; + +$FRAMESIZE=14; + +$code=<<___; +.text + +.set noat +.set noreorder + +.align 5 +.globl bn_mul_mont +.ent bn_mul_mont +bn_mul_mont: +___ +$code.=<<___ if ($flavour =~ /o32/i); + lw $n0,16($sp) + lw $num,20($sp) +___ +$code.=<<___; + slt $at,$num,4 + bnez $at,1f + li $t0,0 + slt $at,$num,17 # on in-order CPU + bnezl $at,bn_mul_mont_internal + nop +1: jr $ra + li $a0,0 +.end bn_mul_mont + +.align 5 +.ent bn_mul_mont_internal +bn_mul_mont_internal: + .frame $fp,$FRAMESIZE*$SZREG,$ra + .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG + $PTR_SUB $sp,$FRAMESIZE*$SZREG + $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp) + $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp) + $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp) + $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp) + $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp) + $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp) + $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp) + $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp) + $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp) + $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp) + $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp) + $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp) +___ +$code.=<<___; + move $fp,$sp + + .set reorder + $LD $n0,0($n0) + $LD $bi,0($bp) # bp[0] + $LD $aj,0($ap) # ap[0] + $LD $nj,0($np) # np[0] + + $PTR_SUB $sp,2*$BNSZ # place for two extra words + sll $num,`log($BNSZ)/log(2)` + li $at,-4096 + $PTR_SUB $sp,$num + and $sp,$at + + $MULTU $aj,$bi + $LD $alo,$BNSZ($ap) + $LD $nlo,$BNSZ($np) + mflo $lo0 + mfhi $hi0 + $MULTU $lo0,$n0 + mflo $m1 + + $MULTU $alo,$bi + mflo $alo + mfhi $ahi + + $MULTU $nj,$m1 + mflo $lo1 + mfhi $hi1 + $MULTU $nlo,$m1 + $ADDU $lo1,$lo0 + sltu $at,$lo1,$lo0 + $ADDU $hi1,$at + mflo $nlo + mfhi $nhi + + move $tp,$sp + li $j,2*$BNSZ +.align 4 +.L1st: + .set noreorder + $PTR_ADD $aj,$ap,$j + $PTR_ADD $nj,$np,$j + $LD $aj,($aj) + $LD $nj,($nj) + + $MULTU $aj,$bi + $ADDU $lo0,$alo,$hi0 + $ADDU $lo1,$nlo,$hi1 + sltu $at,$lo0,$hi0 + sltu $t0,$lo1,$hi1 + $ADDU $hi0,$ahi,$at + $ADDU $hi1,$nhi,$t0 + mflo $alo + mfhi $ahi + + $ADDU $lo1,$lo0 + sltu $at,$lo1,$lo0 + $MULTU $nj,$m1 + $ADDU $hi1,$at + addu $j,$BNSZ + $ST $lo1,($tp) + sltu $t0,$j,$num + mflo $nlo + mfhi $nhi + + bnez $t0,.L1st + $PTR_ADD $tp,$BNSZ + .set reorder + + $ADDU $lo0,$alo,$hi0 + sltu $at,$lo0,$hi0 + $ADDU $hi0,$ahi,$at + + $ADDU $lo1,$nlo,$hi1 + sltu $t0,$lo1,$hi1 + $ADDU $hi1,$nhi,$t0 + $ADDU $lo1,$lo0 + sltu $at,$lo1,$lo0 + $ADDU $hi1,$at + + $ST $lo1,($tp) + + $ADDU $hi1,$hi0 + sltu $at,$hi1,$hi0 + $ST $hi1,$BNSZ($tp) + $ST $at,2*$BNSZ($tp) + + li $i,$BNSZ +.align 4 +.Louter: + $PTR_ADD $bi,$bp,$i + $LD $bi,($bi) + $LD $aj,($ap) + $LD $alo,$BNSZ($ap) + $LD $tj,($sp) + + $MULTU $aj,$bi + $LD $nj,($np) + $LD $nlo,$BNSZ($np) + mflo $lo0 + mfhi $hi0 + $ADDU $lo0,$tj + $MULTU $lo0,$n0 + sltu $at,$lo0,$tj + $ADDU $hi0,$at + mflo $m1 + + $MULTU $alo,$bi + mflo $alo + mfhi $ahi + + $MULTU $nj,$m1 + mflo $lo1 + mfhi $hi1 + + $MULTU $nlo,$m1 + $ADDU $lo1,$lo0 + sltu $at,$lo1,$lo0 + $ADDU $hi1,$at + mflo $nlo + mfhi $nhi + + move $tp,$sp + li $j,2*$BNSZ + $LD $tj,$BNSZ($tp) +.align 4 +.Linner: + .set noreorder + $PTR_ADD $aj,$ap,$j + $PTR_ADD $nj,$np,$j + $LD $aj,($aj) + $LD $nj,($nj) + + $MULTU $aj,$bi + $ADDU $lo0,$alo,$hi0 + $ADDU $lo1,$nlo,$hi1 + sltu $at,$lo0,$hi0 + sltu $t0,$lo1,$hi1 + $ADDU $hi0,$ahi,$at + $ADDU $hi1,$nhi,$t0 + mflo $alo + mfhi $ahi + + $ADDU $lo0,$tj + addu $j,$BNSZ + $MULTU $nj,$m1 + sltu $at,$lo0,$tj + $ADDU $lo1,$lo0 + $ADDU $hi0,$at + sltu $t0,$lo1,$lo0 + $LD $tj,2*$BNSZ($tp) + $ADDU $hi1,$t0 + sltu $at,$j,$num + mflo $nlo + mfhi $nhi + $ST $lo1,($tp) + bnez $at,.Linner + $PTR_ADD $tp,$BNSZ + .set reorder + + $ADDU $lo0,$alo,$hi0 + sltu $at,$lo0,$hi0 + $ADDU $hi0,$ahi,$at + $ADDU $lo0,$tj + sltu $t0,$lo0,$tj + $ADDU $hi0,$t0 + + $LD $tj,2*$BNSZ($tp) + $ADDU $lo1,$nlo,$hi1 + sltu $at,$lo1,$hi1 + $ADDU $hi1,$nhi,$at + $ADDU $lo1,$lo0 + sltu $t0,$lo1,$lo0 + $ADDU $hi1,$t0 + $ST $lo1,($tp) + + $ADDU $lo1,$hi1,$hi0 + sltu $hi1,$lo1,$hi0 + $ADDU $lo1,$tj + sltu $at,$lo1,$tj + $ADDU $hi1,$at + $ST $lo1,$BNSZ($tp) + $ST $hi1,2*$BNSZ($tp) + + addu $i,$BNSZ + sltu $t0,$i,$num + bnez $t0,.Louter + + .set noreorder + $PTR_ADD $tj,$sp,$num # &tp[num] + move $tp,$sp + move $ap,$sp + li $hi0,0 # clear borrow bit + +.align 4 +.Lsub: $LD $lo0,($tp) + $LD $lo1,($np) + $PTR_ADD $tp,$BNSZ + $PTR_ADD $np,$BNSZ + $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i] + sgtu $at,$lo1,$lo0 + $SUBU $lo0,$lo1,$hi0 + sgtu $hi0,$lo0,$lo1 + $ST $lo0,($rp) + or $hi0,$at + sltu $at,$tp,$tj + bnez $at,.Lsub + $PTR_ADD $rp,$BNSZ + + $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit + move $tp,$sp + $PTR_SUB $rp,$num # restore rp + not $hi1,$hi0 + + and $ap,$hi0,$sp + and $bp,$hi1,$rp + or $ap,$ap,$bp # ap=borrow?tp:rp + +.align 4 +.Lcopy: $LD $aj,($ap) + $PTR_ADD $ap,$BNSZ + $ST $zero,($tp) + $PTR_ADD $tp,$BNSZ + sltu $at,$tp,$tj + $ST $aj,($rp) + bnez $at,.Lcopy + $PTR_ADD $rp,$BNSZ + + li $a0,1 + li $t0,1 + + .set noreorder + move $sp,$fp + $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp) + $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp) + $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp) + $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp) + $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp) + $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp) + $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp) + $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp) + $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp) + $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp) + $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp) + $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE*$SZREG +.end bn_mul_mont_internal +.rdata +.asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by " +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/bn/asm/mips.pl b/src/lib/libcrypto/bn/asm/mips.pl new file mode 100644 index 0000000000..c162a3ec23 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/mips.pl @@ -0,0 +1,2585 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. +# +# Rights for redistribution and usage in source and binary forms are +# granted according to the OpenSSL license. Warranty of any kind is +# disclaimed. +# ==================================================================== + + +# July 1999 +# +# This is drop-in MIPS III/IV ISA replacement for crypto/bn/bn_asm.c. +# +# The module is designed to work with either of the "new" MIPS ABI(5), +# namely N32 or N64, offered by IRIX 6.x. It's not ment to work under +# IRIX 5.x not only because it doesn't support new ABIs but also +# because 5.x kernels put R4x00 CPU into 32-bit mode and all those +# 64-bit instructions (daddu, dmultu, etc.) found below gonna only +# cause illegal instruction exception:-( +# +# In addition the code depends on preprocessor flags set up by MIPSpro +# compiler driver (either as or cc) and therefore (probably?) can't be +# compiled by the GNU assembler. GNU C driver manages fine though... +# I mean as long as -mmips-as is specified or is the default option, +# because then it simply invokes /usr/bin/as which in turn takes +# perfect care of the preprocessor definitions. Another neat feature +# offered by the MIPSpro assembler is an optimization pass. This gave +# me the opportunity to have the code looking more regular as all those +# architecture dependent instruction rescheduling details were left to +# the assembler. Cool, huh? +# +# Performance improvement is astonishing! 'apps/openssl speed rsa dsa' +# goes way over 3 times faster! +# +# + +# October 2010 +# +# Adapt the module even for 32-bit ABIs and other OSes. The former was +# achieved by mechanical replacement of 64-bit arithmetic instructions +# such as dmultu, daddu, etc. with their 32-bit counterparts and +# adjusting offsets denoting multiples of BN_ULONG. Above mentioned +# >3x performance improvement naturally does not apply to 32-bit code +# [because there is no instruction 32-bit compiler can't use], one +# has to content with 40-85% improvement depending on benchmark and +# key length, more for longer keys. + +$flavour = shift; +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +if ($flavour =~ /64|n32/i) { + $LD="ld"; + $ST="sd"; + $MULTU="dmultu"; + $DIVU="ddivu"; + $ADDU="daddu"; + $SUBU="dsubu"; + $SRL="dsrl"; + $SLL="dsll"; + $BNSZ=8; + $PTR_ADD="daddu"; + $PTR_SUB="dsubu"; + $SZREG=8; + $REG_S="sd"; + $REG_L="ld"; +} else { + $LD="lw"; + $ST="sw"; + $MULTU="multu"; + $DIVU="divu"; + $ADDU="addu"; + $SUBU="subu"; + $SRL="srl"; + $SLL="sll"; + $BNSZ=4; + $PTR_ADD="addu"; + $PTR_SUB="subu"; + $SZREG=4; + $REG_S="sw"; + $REG_L="lw"; + $code=".set mips2\n"; +} + +# Below is N32/64 register layout used in the original module. +# +($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +($ta0,$ta1,$ta2,$ta3)=($a4,$a5,$a6,$a7); +# +# No special adaptation is required for O32. NUBI on the other hand +# is treated by saving/restoring ($v1,$t0..$t3). + +$gp=$v1 if ($flavour =~ /nubi/i); + +$minus4=$v1; + +$code.=<<___; +.rdata +.asciiz "mips3.s, Version 1.2" +.asciiz "MIPS II/III/IV ISA artwork by Andy Polyakov " + +.text +.set noat + +.align 5 +.globl bn_mul_add_words +.ent bn_mul_add_words +bn_mul_add_words: + .set noreorder + bgtz $a2,bn_mul_add_words_internal + move $v0,$zero + jr $ra + move $a0,$v0 +.end bn_mul_add_words + +.align 5 +.ent bn_mul_add_words_internal +bn_mul_add_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + li $minus4,-4 + and $ta0,$a2,$minus4 + $LD $t0,0($a1) + beqz $ta0,.L_bn_mul_add_words_tail + +.L_bn_mul_add_words_loop: + $MULTU $t0,$a3 + $LD $t1,0($a0) + $LD $t2,$BNSZ($a1) + $LD $t3,$BNSZ($a0) + $LD $ta0,2*$BNSZ($a1) + $LD $ta1,2*$BNSZ($a0) + $ADDU $t1,$v0 + sltu $v0,$t1,$v0 # All manuals say it "compares 32-bit + # values", but it seems to work fine + # even on 64-bit registers. + mflo $at + mfhi $t0 + $ADDU $t1,$at + $ADDU $v0,$t0 + $MULTU $t2,$a3 + sltu $at,$t1,$at + $ST $t1,0($a0) + $ADDU $v0,$at + + $LD $ta2,3*$BNSZ($a1) + $LD $ta3,3*$BNSZ($a0) + $ADDU $t3,$v0 + sltu $v0,$t3,$v0 + mflo $at + mfhi $t2 + $ADDU $t3,$at + $ADDU $v0,$t2 + $MULTU $ta0,$a3 + sltu $at,$t3,$at + $ST $t3,$BNSZ($a0) + $ADDU $v0,$at + + subu $a2,4 + $PTR_ADD $a0,4*$BNSZ + $PTR_ADD $a1,4*$BNSZ + $ADDU $ta1,$v0 + sltu $v0,$ta1,$v0 + mflo $at + mfhi $ta0 + $ADDU $ta1,$at + $ADDU $v0,$ta0 + $MULTU $ta2,$a3 + sltu $at,$ta1,$at + $ST $ta1,-2*$BNSZ($a0) + $ADDU $v0,$at + + + and $ta0,$a2,$minus4 + $ADDU $ta3,$v0 + sltu $v0,$ta3,$v0 + mflo $at + mfhi $ta2 + $ADDU $ta3,$at + $ADDU $v0,$ta2 + sltu $at,$ta3,$at + $ST $ta3,-$BNSZ($a0) + $ADDU $v0,$at + .set noreorder + bgtzl $ta0,.L_bn_mul_add_words_loop + $LD $t0,0($a1) + + beqz $a2,.L_bn_mul_add_words_return + nop + +.L_bn_mul_add_words_tail: + .set reorder + $LD $t0,0($a1) + $MULTU $t0,$a3 + $LD $t1,0($a0) + subu $a2,1 + $ADDU $t1,$v0 + sltu $v0,$t1,$v0 + mflo $at + mfhi $t0 + $ADDU $t1,$at + $ADDU $v0,$t0 + sltu $at,$t1,$at + $ST $t1,0($a0) + $ADDU $v0,$at + beqz $a2,.L_bn_mul_add_words_return + + $LD $t0,$BNSZ($a1) + $MULTU $t0,$a3 + $LD $t1,$BNSZ($a0) + subu $a2,1 + $ADDU $t1,$v0 + sltu $v0,$t1,$v0 + mflo $at + mfhi $t0 + $ADDU $t1,$at + $ADDU $v0,$t0 + sltu $at,$t1,$at + $ST $t1,$BNSZ($a0) + $ADDU $v0,$at + beqz $a2,.L_bn_mul_add_words_return + + $LD $t0,2*$BNSZ($a1) + $MULTU $t0,$a3 + $LD $t1,2*$BNSZ($a0) + $ADDU $t1,$v0 + sltu $v0,$t1,$v0 + mflo $at + mfhi $t0 + $ADDU $t1,$at + $ADDU $v0,$t0 + sltu $at,$t1,$at + $ST $t1,2*$BNSZ($a0) + $ADDU $v0,$at + +.L_bn_mul_add_words_return: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 +.end bn_mul_add_words_internal + +.align 5 +.globl bn_mul_words +.ent bn_mul_words +bn_mul_words: + .set noreorder + bgtz $a2,bn_mul_words_internal + move $v0,$zero + jr $ra + move $a0,$v0 +.end bn_mul_words + +.align 5 +.ent bn_mul_words_internal +bn_mul_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + li $minus4,-4 + and $ta0,$a2,$minus4 + $LD $t0,0($a1) + beqz $ta0,.L_bn_mul_words_tail + +.L_bn_mul_words_loop: + $MULTU $t0,$a3 + $LD $t2,$BNSZ($a1) + $LD $ta0,2*$BNSZ($a1) + $LD $ta2,3*$BNSZ($a1) + mflo $at + mfhi $t0 + $ADDU $v0,$at + sltu $t1,$v0,$at + $MULTU $t2,$a3 + $ST $v0,0($a0) + $ADDU $v0,$t1,$t0 + + subu $a2,4 + $PTR_ADD $a0,4*$BNSZ + $PTR_ADD $a1,4*$BNSZ + mflo $at + mfhi $t2 + $ADDU $v0,$at + sltu $t3,$v0,$at + $MULTU $ta0,$a3 + $ST $v0,-3*$BNSZ($a0) + $ADDU $v0,$t3,$t2 + + mflo $at + mfhi $ta0 + $ADDU $v0,$at + sltu $ta1,$v0,$at + $MULTU $ta2,$a3 + $ST $v0,-2*$BNSZ($a0) + $ADDU $v0,$ta1,$ta0 + + and $ta0,$a2,$minus4 + mflo $at + mfhi $ta2 + $ADDU $v0,$at + sltu $ta3,$v0,$at + $ST $v0,-$BNSZ($a0) + $ADDU $v0,$ta3,$ta2 + .set noreorder + bgtzl $ta0,.L_bn_mul_words_loop + $LD $t0,0($a1) + + beqz $a2,.L_bn_mul_words_return + nop + +.L_bn_mul_words_tail: + .set reorder + $LD $t0,0($a1) + $MULTU $t0,$a3 + subu $a2,1 + mflo $at + mfhi $t0 + $ADDU $v0,$at + sltu $t1,$v0,$at + $ST $v0,0($a0) + $ADDU $v0,$t1,$t0 + beqz $a2,.L_bn_mul_words_return + + $LD $t0,$BNSZ($a1) + $MULTU $t0,$a3 + subu $a2,1 + mflo $at + mfhi $t0 + $ADDU $v0,$at + sltu $t1,$v0,$at + $ST $v0,$BNSZ($a0) + $ADDU $v0,$t1,$t0 + beqz $a2,.L_bn_mul_words_return + + $LD $t0,2*$BNSZ($a1) + $MULTU $t0,$a3 + mflo $at + mfhi $t0 + $ADDU $v0,$at + sltu $t1,$v0,$at + $ST $v0,2*$BNSZ($a0) + $ADDU $v0,$t1,$t0 + +.L_bn_mul_words_return: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 +.end bn_mul_words_internal + +.align 5 +.globl bn_sqr_words +.ent bn_sqr_words +bn_sqr_words: + .set noreorder + bgtz $a2,bn_sqr_words_internal + move $v0,$zero + jr $ra + move $a0,$v0 +.end bn_sqr_words + +.align 5 +.ent bn_sqr_words_internal +bn_sqr_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + li $minus4,-4 + and $ta0,$a2,$minus4 + $LD $t0,0($a1) + beqz $ta0,.L_bn_sqr_words_tail + +.L_bn_sqr_words_loop: + $MULTU $t0,$t0 + $LD $t2,$BNSZ($a1) + $LD $ta0,2*$BNSZ($a1) + $LD $ta2,3*$BNSZ($a1) + mflo $t1 + mfhi $t0 + $ST $t1,0($a0) + $ST $t0,$BNSZ($a0) + + $MULTU $t2,$t2 + subu $a2,4 + $PTR_ADD $a0,8*$BNSZ + $PTR_ADD $a1,4*$BNSZ + mflo $t3 + mfhi $t2 + $ST $t3,-6*$BNSZ($a0) + $ST $t2,-5*$BNSZ($a0) + + $MULTU $ta0,$ta0 + mflo $ta1 + mfhi $ta0 + $ST $ta1,-4*$BNSZ($a0) + $ST $ta0,-3*$BNSZ($a0) + + + $MULTU $ta2,$ta2 + and $ta0,$a2,$minus4 + mflo $ta3 + mfhi $ta2 + $ST $ta3,-2*$BNSZ($a0) + $ST $ta2,-$BNSZ($a0) + + .set noreorder + bgtzl $ta0,.L_bn_sqr_words_loop + $LD $t0,0($a1) + + beqz $a2,.L_bn_sqr_words_return + nop + +.L_bn_sqr_words_tail: + .set reorder + $LD $t0,0($a1) + $MULTU $t0,$t0 + subu $a2,1 + mflo $t1 + mfhi $t0 + $ST $t1,0($a0) + $ST $t0,$BNSZ($a0) + beqz $a2,.L_bn_sqr_words_return + + $LD $t0,$BNSZ($a1) + $MULTU $t0,$t0 + subu $a2,1 + mflo $t1 + mfhi $t0 + $ST $t1,2*$BNSZ($a0) + $ST $t0,3*$BNSZ($a0) + beqz $a2,.L_bn_sqr_words_return + + $LD $t0,2*$BNSZ($a1) + $MULTU $t0,$t0 + mflo $t1 + mfhi $t0 + $ST $t1,4*$BNSZ($a0) + $ST $t0,5*$BNSZ($a0) + +.L_bn_sqr_words_return: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 + +.end bn_sqr_words_internal + +.align 5 +.globl bn_add_words +.ent bn_add_words +bn_add_words: + .set noreorder + bgtz $a3,bn_add_words_internal + move $v0,$zero + jr $ra + move $a0,$v0 +.end bn_add_words + +.align 5 +.ent bn_add_words_internal +bn_add_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + li $minus4,-4 + and $at,$a3,$minus4 + $LD $t0,0($a1) + beqz $at,.L_bn_add_words_tail + +.L_bn_add_words_loop: + $LD $ta0,0($a2) + subu $a3,4 + $LD $t1,$BNSZ($a1) + and $at,$a3,$minus4 + $LD $t2,2*$BNSZ($a1) + $PTR_ADD $a2,4*$BNSZ + $LD $t3,3*$BNSZ($a1) + $PTR_ADD $a0,4*$BNSZ + $LD $ta1,-3*$BNSZ($a2) + $PTR_ADD $a1,4*$BNSZ + $LD $ta2,-2*$BNSZ($a2) + $LD $ta3,-$BNSZ($a2) + $ADDU $ta0,$t0 + sltu $t8,$ta0,$t0 + $ADDU $t0,$ta0,$v0 + sltu $v0,$t0,$ta0 + $ST $t0,-4*$BNSZ($a0) + $ADDU $v0,$t8 + + $ADDU $ta1,$t1 + sltu $t9,$ta1,$t1 + $ADDU $t1,$ta1,$v0 + sltu $v0,$t1,$ta1 + $ST $t1,-3*$BNSZ($a0) + $ADDU $v0,$t9 + + $ADDU $ta2,$t2 + sltu $t8,$ta2,$t2 + $ADDU $t2,$ta2,$v0 + sltu $v0,$t2,$ta2 + $ST $t2,-2*$BNSZ($a0) + $ADDU $v0,$t8 + + $ADDU $ta3,$t3 + sltu $t9,$ta3,$t3 + $ADDU $t3,$ta3,$v0 + sltu $v0,$t3,$ta3 + $ST $t3,-$BNSZ($a0) + $ADDU $v0,$t9 + + .set noreorder + bgtzl $at,.L_bn_add_words_loop + $LD $t0,0($a1) + + beqz $a3,.L_bn_add_words_return + nop + +.L_bn_add_words_tail: + .set reorder + $LD $t0,0($a1) + $LD $ta0,0($a2) + $ADDU $ta0,$t0 + subu $a3,1 + sltu $t8,$ta0,$t0 + $ADDU $t0,$ta0,$v0 + sltu $v0,$t0,$ta0 + $ST $t0,0($a0) + $ADDU $v0,$t8 + beqz $a3,.L_bn_add_words_return + + $LD $t1,$BNSZ($a1) + $LD $ta1,$BNSZ($a2) + $ADDU $ta1,$t1 + subu $a3,1 + sltu $t9,$ta1,$t1 + $ADDU $t1,$ta1,$v0 + sltu $v0,$t1,$ta1 + $ST $t1,$BNSZ($a0) + $ADDU $v0,$t9 + beqz $a3,.L_bn_add_words_return + + $LD $t2,2*$BNSZ($a1) + $LD $ta2,2*$BNSZ($a2) + $ADDU $ta2,$t2 + sltu $t8,$ta2,$t2 + $ADDU $t2,$ta2,$v0 + sltu $v0,$t2,$ta2 + $ST $t2,2*$BNSZ($a0) + $ADDU $v0,$t8 + +.L_bn_add_words_return: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 + +.end bn_add_words_internal + +.align 5 +.globl bn_sub_words +.ent bn_sub_words +bn_sub_words: + .set noreorder + bgtz $a3,bn_sub_words_internal + move $v0,$zero + jr $ra + move $a0,$zero +.end bn_sub_words + +.align 5 +.ent bn_sub_words_internal +bn_sub_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + li $minus4,-4 + and $at,$a3,$minus4 + $LD $t0,0($a1) + beqz $at,.L_bn_sub_words_tail + +.L_bn_sub_words_loop: + $LD $ta0,0($a2) + subu $a3,4 + $LD $t1,$BNSZ($a1) + and $at,$a3,$minus4 + $LD $t2,2*$BNSZ($a1) + $PTR_ADD $a2,4*$BNSZ + $LD $t3,3*$BNSZ($a1) + $PTR_ADD $a0,4*$BNSZ + $LD $ta1,-3*$BNSZ($a2) + $PTR_ADD $a1,4*$BNSZ + $LD $ta2,-2*$BNSZ($a2) + $LD $ta3,-$BNSZ($a2) + sltu $t8,$t0,$ta0 + $SUBU $ta0,$t0,$ta0 + $SUBU $t0,$ta0,$v0 + sgtu $v0,$t0,$ta0 + $ST $t0,-4*$BNSZ($a0) + $ADDU $v0,$t8 + + sltu $t9,$t1,$ta1 + $SUBU $ta1,$t1,$ta1 + $SUBU $t1,$ta1,$v0 + sgtu $v0,$t1,$ta1 + $ST $t1,-3*$BNSZ($a0) + $ADDU $v0,$t9 + + + sltu $t8,$t2,$ta2 + $SUBU $ta2,$t2,$ta2 + $SUBU $t2,$ta2,$v0 + sgtu $v0,$t2,$ta2 + $ST $t2,-2*$BNSZ($a0) + $ADDU $v0,$t8 + + sltu $t9,$t3,$ta3 + $SUBU $ta3,$t3,$ta3 + $SUBU $t3,$ta3,$v0 + sgtu $v0,$t3,$ta3 + $ST $t3,-$BNSZ($a0) + $ADDU $v0,$t9 + + .set noreorder + bgtzl $at,.L_bn_sub_words_loop + $LD $t0,0($a1) + + beqz $a3,.L_bn_sub_words_return + nop + +.L_bn_sub_words_tail: + .set reorder + $LD $t0,0($a1) + $LD $ta0,0($a2) + subu $a3,1 + sltu $t8,$t0,$ta0 + $SUBU $ta0,$t0,$ta0 + $SUBU $t0,$ta0,$v0 + sgtu $v0,$t0,$ta0 + $ST $t0,0($a0) + $ADDU $v0,$t8 + beqz $a3,.L_bn_sub_words_return + + $LD $t1,$BNSZ($a1) + subu $a3,1 + $LD $ta1,$BNSZ($a2) + sltu $t9,$t1,$ta1 + $SUBU $ta1,$t1,$ta1 + $SUBU $t1,$ta1,$v0 + sgtu $v0,$t1,$ta1 + $ST $t1,$BNSZ($a0) + $ADDU $v0,$t9 + beqz $a3,.L_bn_sub_words_return + + $LD $t2,2*$BNSZ($a1) + $LD $ta2,2*$BNSZ($a2) + sltu $t8,$t2,$ta2 + $SUBU $ta2,$t2,$ta2 + $SUBU $t2,$ta2,$v0 + sgtu $v0,$t2,$ta2 + $ST $t2,2*$BNSZ($a0) + $ADDU $v0,$t8 + +.L_bn_sub_words_return: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 +.end bn_sub_words_internal + +.align 5 +.globl bn_div_3_words +.ent bn_div_3_words +bn_div_3_words: + .set noreorder + move $a3,$a0 # we know that bn_div_words does not + # touch $a3, $ta2, $ta3 and preserves $a2 + # so that we can save two arguments + # and return address in registers + # instead of stack:-) + + $LD $a0,($a3) + move $ta2,$a1 + bne $a0,$a2,bn_div_3_words_internal + $LD $a1,-$BNSZ($a3) + li $v0,-1 + jr $ra + move $a0,$v0 +.end bn_div_3_words + +.align 5 +.ent bn_div_3_words_internal +bn_div_3_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + move $ta3,$ra + bal bn_div_words + move $ra,$ta3 + $MULTU $ta2,$v0 + $LD $t2,-2*$BNSZ($a3) + move $ta0,$zero + mfhi $t1 + mflo $t0 + sltu $t8,$t1,$a1 +.L_bn_div_3_words_inner_loop: + bnez $t8,.L_bn_div_3_words_inner_loop_done + sgeu $at,$t2,$t0 + seq $t9,$t1,$a1 + and $at,$t9 + sltu $t3,$t0,$ta2 + $ADDU $a1,$a2 + $SUBU $t1,$t3 + $SUBU $t0,$ta2 + sltu $t8,$t1,$a1 + sltu $ta0,$a1,$a2 + or $t8,$ta0 + .set noreorder + beqzl $at,.L_bn_div_3_words_inner_loop + $SUBU $v0,1 + .set reorder +.L_bn_div_3_words_inner_loop_done: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 +.end bn_div_3_words_internal + +.align 5 +.globl bn_div_words +.ent bn_div_words +bn_div_words: + .set noreorder + bnez $a2,bn_div_words_internal + li $v0,-1 # I would rather signal div-by-zero + # which can be done with 'break 7' + jr $ra + move $a0,$v0 +.end bn_div_words + +.align 5 +.ent bn_div_words_internal +bn_div_words_internal: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + move $v1,$zero + bltz $a2,.L_bn_div_words_body + move $t9,$v1 + $SLL $a2,1 + bgtz $a2,.-4 + addu $t9,1 + + .set reorder + negu $t1,$t9 + li $t2,-1 + $SLL $t2,$t1 + and $t2,$a0 + $SRL $at,$a1,$t1 + .set noreorder + bnezl $t2,.+8 + break 6 # signal overflow + .set reorder + $SLL $a0,$t9 + $SLL $a1,$t9 + or $a0,$at +___ +$QT=$ta0; +$HH=$ta1; +$DH=$v1; +$code.=<<___; +.L_bn_div_words_body: + $SRL $DH,$a2,4*$BNSZ # bits + sgeu $at,$a0,$a2 + .set noreorder + bnezl $at,.+8 + $SUBU $a0,$a2 + .set reorder + + li $QT,-1 + $SRL $HH,$a0,4*$BNSZ # bits + $SRL $QT,4*$BNSZ # q=0xffffffff + beq $DH,$HH,.L_bn_div_words_skip_div1 + $DIVU $zero,$a0,$DH + mflo $QT +.L_bn_div_words_skip_div1: + $MULTU $a2,$QT + $SLL $t3,$a0,4*$BNSZ # bits + $SRL $at,$a1,4*$BNSZ # bits + or $t3,$at + mflo $t0 + mfhi $t1 +.L_bn_div_words_inner_loop1: + sltu $t2,$t3,$t0 + seq $t8,$HH,$t1 + sltu $at,$HH,$t1 + and $t2,$t8 + sltu $v0,$t0,$a2 + or $at,$t2 + .set noreorder + beqz $at,.L_bn_div_words_inner_loop1_done + $SUBU $t1,$v0 + $SUBU $t0,$a2 + b .L_bn_div_words_inner_loop1 + $SUBU $QT,1 + .set reorder +.L_bn_div_words_inner_loop1_done: + + $SLL $a1,4*$BNSZ # bits + $SUBU $a0,$t3,$t0 + $SLL $v0,$QT,4*$BNSZ # bits + + li $QT,-1 + $SRL $HH,$a0,4*$BNSZ # bits + $SRL $QT,4*$BNSZ # q=0xffffffff + beq $DH,$HH,.L_bn_div_words_skip_div2 + $DIVU $zero,$a0,$DH + mflo $QT +.L_bn_div_words_skip_div2: + $MULTU $a2,$QT + $SLL $t3,$a0,4*$BNSZ # bits + $SRL $at,$a1,4*$BNSZ # bits + or $t3,$at + mflo $t0 + mfhi $t1 +.L_bn_div_words_inner_loop2: + sltu $t2,$t3,$t0 + seq $t8,$HH,$t1 + sltu $at,$HH,$t1 + and $t2,$t8 + sltu $v1,$t0,$a2 + or $at,$t2 + .set noreorder + beqz $at,.L_bn_div_words_inner_loop2_done + $SUBU $t1,$v1 + $SUBU $t0,$a2 + b .L_bn_div_words_inner_loop2 + $SUBU $QT,1 + .set reorder +.L_bn_div_words_inner_loop2_done: + + $SUBU $a0,$t3,$t0 + or $v0,$QT + $SRL $v1,$a0,$t9 # $v1 contains remainder if anybody wants it + $SRL $a2,$t9 # restore $a2 + + .set noreorder + move $a1,$v1 +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + move $a0,$v0 +.end bn_div_words_internal +___ +undef $HH; undef $QT; undef $DH; + +($a_0,$a_1,$a_2,$a_3)=($t0,$t1,$t2,$t3); +($b_0,$b_1,$b_2,$b_3)=($ta0,$ta1,$ta2,$ta3); + +($a_4,$a_5,$a_6,$a_7)=($s0,$s2,$s4,$a1); # once we load a[7], no use for $a1 +($b_4,$b_5,$b_6,$b_7)=($s1,$s3,$s5,$a2); # once we load b[7], no use for $a2 + +($t_1,$t_2,$c_1,$c_2,$c_3)=($t8,$t9,$v0,$v1,$a3); + +$code.=<<___; + +.align 5 +.globl bn_mul_comba8 +.ent bn_mul_comba8 +bn_mul_comba8: + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,12*$SZREG,$ra + .mask 0x803ff008,-$SZREG + $PTR_SUB $sp,12*$SZREG + $REG_S $ra,11*$SZREG($sp) + $REG_S $s5,10*$SZREG($sp) + $REG_S $s4,9*$SZREG($sp) + $REG_S $s3,8*$SZREG($sp) + $REG_S $s2,7*$SZREG($sp) + $REG_S $s1,6*$SZREG($sp) + $REG_S $s0,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___ if ($flavour !~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x003f0000,-$SZREG + $PTR_SUB $sp,6*$SZREG + $REG_S $s5,5*$SZREG($sp) + $REG_S $s4,4*$SZREG($sp) + $REG_S $s3,3*$SZREG($sp) + $REG_S $s2,2*$SZREG($sp) + $REG_S $s1,1*$SZREG($sp) + $REG_S $s0,0*$SZREG($sp) +___ +$code.=<<___; + + .set reorder + $LD $a_0,0($a1) # If compiled with -mips3 option on + # R5000 box assembler barks on this + # 1ine with "should not have mult/div + # as last instruction in bb (R10K + # bug)" warning. If anybody out there + # has a clue about how to circumvent + # this do send me a note. + # + + $LD $b_0,0($a2) + $LD $a_1,$BNSZ($a1) + $LD $a_2,2*$BNSZ($a1) + $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $LD $a_3,3*$BNSZ($a1) + $LD $b_1,$BNSZ($a2) + $LD $b_2,2*$BNSZ($a2) + $LD $b_3,3*$BNSZ($a2) + mflo $c_1 + mfhi $c_2 + + $LD $a_4,4*$BNSZ($a1) + $LD $a_5,5*$BNSZ($a1) + $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); + $LD $a_6,6*$BNSZ($a1) + $LD $a_7,7*$BNSZ($a1) + $LD $b_4,4*$BNSZ($a2) + $LD $b_5,5*$BNSZ($a2) + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); + $ADDU $c_3,$t_2,$at + $LD $b_6,6*$BNSZ($a2) + $LD $b_7,7*$BNSZ($a2) + $ST $c_1,0($a0) # r[0]=c1; + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + $ST $c_2,$BNSZ($a0) # r[1]=c2; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,2*$BNSZ($a0) # r[2]=c3; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $c_3,$c_2,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_4,$b_0 # mul_add_c(a[4],b[0],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,3*$BNSZ($a0) # r[3]=c1; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_0,$b_4 # mul_add_c(a[0],b[4],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_0,$b_5 # mul_add_c(a[0],b[5],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,4*$BNSZ($a0) # r[4]=c2; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_1,$b_4 # mul_add_c(a[1],b[4],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_4,$b_1 # mul_add_c(a[4],b[1],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_5,$b_0 # mul_add_c(a[5],b[0],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_6,$b_0 # mul_add_c(a[6],b[0],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,5*$BNSZ($a0) # r[5]=c3; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_5,$b_1 # mul_add_c(a[5],b[1],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $c_3,$c_2,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_4,$b_2 # mul_add_c(a[4],b[2],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_2,$b_4 # mul_add_c(a[2],b[4],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_1,$b_5 # mul_add_c(a[1],b[5],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_0,$b_6 # mul_add_c(a[0],b[6],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_0,$b_7 # mul_add_c(a[0],b[7],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,6*$BNSZ($a0) # r[6]=c1; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_1,$b_6 # mul_add_c(a[1],b[6],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_5 # mul_add_c(a[2],b[5],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_3,$b_4 # mul_add_c(a[3],b[4],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_4,$b_3 # mul_add_c(a[4],b[3],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_5,$b_2 # mul_add_c(a[5],b[2],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_6,$b_1 # mul_add_c(a[6],b[1],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_7,$b_0 # mul_add_c(a[7],b[0],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_7,$b_1 # mul_add_c(a[7],b[1],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,7*$BNSZ($a0) # r[7]=c2; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_6,$b_2 # mul_add_c(a[6],b[2],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_5,$b_3 # mul_add_c(a[5],b[3],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_4,$b_4 # mul_add_c(a[4],b[4],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_3,$b_5 # mul_add_c(a[3],b[5],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_2,$b_6 # mul_add_c(a[2],b[6],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_1,$b_7 # mul_add_c(a[1],b[7],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_2,$b_7 # mul_add_c(a[2],b[7],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,8*$BNSZ($a0) # r[8]=c3; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_3,$b_6 # mul_add_c(a[3],b[6],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $c_3,$c_2,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_4,$b_5 # mul_add_c(a[4],b[5],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_5,$b_4 # mul_add_c(a[5],b[4],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_6,$b_3 # mul_add_c(a[6],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_7,$b_2 # mul_add_c(a[7],b[2],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_7,$b_3 # mul_add_c(a[7],b[3],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,9*$BNSZ($a0) # r[9]=c1; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_6,$b_4 # mul_add_c(a[6],b[4],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_5,$b_5 # mul_add_c(a[5],b[5],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_4,$b_6 # mul_add_c(a[4],b[6],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_3,$b_7 # mul_add_c(a[3],b[7],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_4,$b_7 # mul_add_c(a[4],b[7],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,10*$BNSZ($a0) # r[10]=c2; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_5,$b_6 # mul_add_c(a[5],b[6],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_6,$b_5 # mul_add_c(a[6],b[5],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_7,$b_4 # mul_add_c(a[7],b[4],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_7,$b_5 # mul_add_c(a[7],b[5],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,11*$BNSZ($a0) # r[11]=c3; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_6,$b_6 # mul_add_c(a[6],b[6],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $c_3,$c_2,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_5,$b_7 # mul_add_c(a[5],b[7],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_6,$b_7 # mul_add_c(a[6],b[7],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,12*$BNSZ($a0) # r[12]=c1; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_7,$b_6 # mul_add_c(a[7],b[6],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_7,$b_7 # mul_add_c(a[7],b[7],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,13*$BNSZ($a0) # r[13]=c2; + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + $ST $c_3,14*$BNSZ($a0) # r[14]=c3; + $ST $c_1,15*$BNSZ($a0) # r[15]=c1; + + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s5,10*$SZREG($sp) + $REG_L $s4,9*$SZREG($sp) + $REG_L $s3,8*$SZREG($sp) + $REG_L $s2,7*$SZREG($sp) + $REG_L $s1,6*$SZREG($sp) + $REG_L $s0,5*$SZREG($sp) + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + jr $ra + $PTR_ADD $sp,12*$SZREG +___ +$code.=<<___ if ($flavour !~ /nubi/i); + $REG_L $s5,5*$SZREG($sp) + $REG_L $s4,4*$SZREG($sp) + $REG_L $s3,3*$SZREG($sp) + $REG_L $s2,2*$SZREG($sp) + $REG_L $s1,1*$SZREG($sp) + $REG_L $s0,0*$SZREG($sp) + jr $ra + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; +.end bn_mul_comba8 + +.align 5 +.globl bn_mul_comba4 +.ent bn_mul_comba4 +bn_mul_comba4: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + $LD $a_0,0($a1) + $LD $b_0,0($a2) + $LD $a_1,$BNSZ($a1) + $LD $a_2,2*$BNSZ($a1) + $MULTU $a_0,$b_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $LD $a_3,3*$BNSZ($a1) + $LD $b_1,$BNSZ($a2) + $LD $b_2,2*$BNSZ($a2) + $LD $b_3,3*$BNSZ($a2) + mflo $c_1 + mfhi $c_2 + $ST $c_1,0($a0) + + $MULTU $a_0,$b_1 # mul_add_c(a[0],b[1],c2,c3,c1); + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_1,$b_0 # mul_add_c(a[1],b[0],c2,c3,c1); + $ADDU $c_3,$t_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_0 # mul_add_c(a[2],b[0],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + $ST $c_2,$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_1,$b_1 # mul_add_c(a[1],b[1],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$b_2 # mul_add_c(a[0],b[2],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$b_3 # mul_add_c(a[0],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,2*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_1,$b_2 # mul_add_c(a[1],b[2],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $c_3,$c_2,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_2,$b_1 # mul_add_c(a[2],b[1],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_3,$b_0 # mul_add_c(a[3],b[0],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_3,$b_1 # mul_add_c(a[3],b[1],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,3*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_2 # mul_add_c(a[2],b[2],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $c_1,$c_3,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_1,$b_3 # mul_add_c(a[1],b[3],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$b_3 # mul_add_c(a[2],b[3],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,4*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_3,$b_2 # mul_add_c(a[3],b[2],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $c_2,$c_1,$t_2 + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_3,$b_3 # mul_add_c(a[3],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,5*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + $ST $c_1,6*$BNSZ($a0) + $ST $c_2,7*$BNSZ($a0) + + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + nop +.end bn_mul_comba4 +___ + +($a_4,$a_5,$a_6,$a_7)=($b_0,$b_1,$b_2,$b_3); + +$code.=<<___; + +.align 5 +.globl bn_sqr_comba8 +.ent bn_sqr_comba8 +bn_sqr_comba8: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + $LD $a_0,0($a1) + $LD $a_1,$BNSZ($a1) + $LD $a_2,2*$BNSZ($a1) + $LD $a_3,3*$BNSZ($a1) + + $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $LD $a_4,4*$BNSZ($a1) + $LD $a_5,5*$BNSZ($a1) + $LD $a_6,6*$BNSZ($a1) + $LD $a_7,7*$BNSZ($a1) + mflo $c_1 + mfhi $c_2 + $ST $c_1,0($a0) + + $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $c_3,$t_2,$at + $ST $c_2,$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,2*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_3,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_2 # mul_add_c2(a[1],b[2],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_4,$a_0 # mul_add_c2(a[4],b[0],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,3*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_1,$at + $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_0,$a_5 # mul_add_c2(a[0],b[5],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,4*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_4 # mul_add_c2(a[1],b[4],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_2,$at + $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $MULTU $a_6,$a_0 # mul_add_c2(a[6],b[0],c1,c2,c3); + $ADDU $c_2,$at + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,5*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_3,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_5,$a_1 # mul_add_c2(a[5],b[1],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_4,$a_2 # mul_add_c2(a[4],b[2],c1,c2,c3); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_0,$a_7 # mul_add_c2(a[0],b[7],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,6*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_6 # mul_add_c2(a[1],b[6],c2,c3,c1); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_1,$at + $MULTU $a_2,$a_5 # mul_add_c2(a[2],b[5],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_1,$at + $MULTU $a_3,$a_4 # mul_add_c2(a[3],b[4],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_1,$at + $MULTU $a_7,$a_1 # mul_add_c2(a[7],b[1],c3,c1,c2); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,7*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_6,$a_2 # mul_add_c2(a[6],b[2],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_2,$at + $MULTU $a_5,$a_3 # mul_add_c2(a[5],b[3],c3,c1,c2); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_2,$at + $MULTU $a_4,$a_4 # mul_add_c(a[4],b[4],c3,c1,c2); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_2,$a_7 # mul_add_c2(a[2],b[7],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,8*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_3,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_3,$a_6 # mul_add_c2(a[3],b[6],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_4,$a_5 # mul_add_c2(a[4],b[5],c1,c2,c3); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_7,$a_3 # mul_add_c2(a[7],b[3],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,9*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_6,$a_4 # mul_add_c2(a[6],b[4],c2,c3,c1); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_1,$at + $MULTU $a_5,$a_5 # mul_add_c(a[5],b[5],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_4,$a_7 # mul_add_c2(a[4],b[7],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,10*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_5,$a_6 # mul_add_c2(a[5],b[6],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_2,$at + $MULTU $a_7,$a_5 # mul_add_c2(a[7],b[5],c1,c2,c3); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,11*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_3,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_6,$a_6 # mul_add_c(a[6],b[6],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $MULTU $a_6,$a_7 # mul_add_c2(a[6],b[7],c2,c3,c1); + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,12*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_7,$a_7 # mul_add_c(a[7],b[7],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,13*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + $ST $c_3,14*$BNSZ($a0) + $ST $c_1,15*$BNSZ($a0) + + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + nop +.end bn_sqr_comba8 + +.align 5 +.globl bn_sqr_comba4 +.ent bn_sqr_comba4 +bn_sqr_comba4: +___ +$code.=<<___ if ($flavour =~ /nubi/i); + .frame $sp,6*$SZREG,$ra + .mask 0x8000f008,-$SZREG + .set noreorder + $PTR_SUB $sp,6*$SZREG + $REG_S $ra,5*$SZREG($sp) + $REG_S $t3,4*$SZREG($sp) + $REG_S $t2,3*$SZREG($sp) + $REG_S $t1,2*$SZREG($sp) + $REG_S $t0,1*$SZREG($sp) + $REG_S $gp,0*$SZREG($sp) +___ +$code.=<<___; + .set reorder + $LD $a_0,0($a1) + $LD $a_1,$BNSZ($a1) + $MULTU $a_0,$a_0 # mul_add_c(a[0],b[0],c1,c2,c3); + $LD $a_2,2*$BNSZ($a1) + $LD $a_3,3*$BNSZ($a1) + mflo $c_1 + mfhi $c_2 + $ST $c_1,0($a0) + + $MULTU $a_0,$a_1 # mul_add_c2(a[0],b[1],c2,c3,c1); + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_2,$a_0 # mul_add_c2(a[2],b[0],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $c_3,$t_2,$at + $ST $c_2,$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_1 # mul_add_c(a[1],b[1],c3,c1,c2); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $MULTU $a_0,$a_3 # mul_add_c2(a[0],b[3],c1,c2,c3); + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,2*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_3,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_1,$a_2 # mul_add_c(a2[1],b[2],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + mflo $t_1 + mfhi $t_2 + slt $at,$t_2,$zero + $ADDU $c_3,$at + $MULTU $a_3,$a_1 # mul_add_c2(a[3],b[1],c2,c3,c1); + $SLL $t_2,1 + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + sltu $at,$c_2,$t_2 + $ADDU $c_3,$at + $ST $c_1,3*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_1,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_2,$a_2 # mul_add_c(a[2],b[2],c2,c3,c1); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + mflo $t_1 + mfhi $t_2 + $ADDU $c_2,$t_1 + sltu $at,$c_2,$t_1 + $MULTU $a_2,$a_3 # mul_add_c2(a[2],b[3],c3,c1,c2); + $ADDU $t_2,$at + $ADDU $c_3,$t_2 + sltu $at,$c_3,$t_2 + $ADDU $c_1,$at + $ST $c_2,4*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + slt $c_2,$t_2,$zero + $SLL $t_2,1 + $MULTU $a_3,$a_3 # mul_add_c(a[3],b[3],c1,c2,c3); + slt $a2,$t_1,$zero + $ADDU $t_2,$a2 + $SLL $t_1,1 + $ADDU $c_3,$t_1 + sltu $at,$c_3,$t_1 + $ADDU $t_2,$at + $ADDU $c_1,$t_2 + sltu $at,$c_1,$t_2 + $ADDU $c_2,$at + $ST $c_3,5*$BNSZ($a0) + + mflo $t_1 + mfhi $t_2 + $ADDU $c_1,$t_1 + sltu $at,$c_1,$t_1 + $ADDU $t_2,$at + $ADDU $c_2,$t_2 + $ST $c_1,6*$BNSZ($a0) + $ST $c_2,7*$BNSZ($a0) + + .set noreorder +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $t3,4*$SZREG($sp) + $REG_L $t2,3*$SZREG($sp) + $REG_L $t1,2*$SZREG($sp) + $REG_L $t0,1*$SZREG($sp) + $REG_L $gp,0*$SZREG($sp) + $PTR_ADD $sp,6*$SZREG +___ +$code.=<<___; + jr $ra + nop +.end bn_sqr_comba4 +___ +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl b/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl new file mode 100644 index 0000000000..54aeb01921 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/modexp512-x86_64.pl @@ -0,0 +1,1496 @@ +#!/usr/bin/env perl +# +# Copyright (c) 2010-2011 Intel Corp. +# Author: Vinodh.Gopal@intel.com +# Jim Guilford +# Erdinc.Ozturk@intel.com +# Maxim.Perminov@intel.com +# +# More information about algorithm used can be found at: +# http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf +# +# ==================================================================== +# Copyright (c) 2011 The OpenSSL Project. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# 3. All advertising materials mentioning features or use of this +# software must display the following acknowledgment: +# "This product includes software developed by the OpenSSL Project +# for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" +# +# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to +# endorse or promote products derived from this software without +# prior written permission. For written permission, please contact +# licensing@OpenSSL.org. +# +# 5. Products derived from this software may not be called "OpenSSL" +# nor may "OpenSSL" appear in their names without prior written +# permission of the OpenSSL Project. +# +# 6. Redistributions of any form whatsoever must retain the following +# acknowledgment: +# "This product includes software developed by the OpenSSL Project +# for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" +# +# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY +# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR +# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT +# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +# OF THE POSSIBILITY OF SUCH DAMAGE. +# ==================================================================== + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +use strict; +my $code=".text\n\n"; +my $m=0; + +# +# Define x512 macros +# + +#MULSTEP_512_ADD MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2 +# +# uses rax, rdx, and args +sub MULSTEP_512_ADD +{ + my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_; + my @X=@$x; # make a copy +$code.=<<___; + mov (+8*0)($SRC2), %rax + mul $OP # rdx:rax = %OP * [0] + mov ($ASRC), $X[0] + add %rax, $X[0] + adc \$0, %rdx + mov $X[0], $DST +___ +for(my $i=1;$i<8;$i++) { +$code.=<<___; + mov %rdx, $TMP + + mov (+8*$i)($SRC2), %rax + mul $OP # rdx:rax = %OP * [$i] + mov (+8*$i)($ASRC), $X[$i] + add %rax, $X[$i] + adc \$0, %rdx + add $TMP, $X[$i] + adc \$0, %rdx +___ +} +$code.=<<___; + mov %rdx, $X[0] +___ +} + +#MULSTEP_512 MACRO x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp +# +# uses rax, rdx, and args +sub MULSTEP_512 +{ + my ($x, $DST, $SRC2, $OP, $TMP)=@_; + my @X=@$x; # make a copy +$code.=<<___; + mov (+8*0)($SRC2), %rax + mul $OP # rdx:rax = %OP * [0] + add %rax, $X[0] + adc \$0, %rdx + mov $X[0], $DST +___ +for(my $i=1;$i<8;$i++) { +$code.=<<___; + mov %rdx, $TMP + + mov (+8*$i)($SRC2), %rax + mul $OP # rdx:rax = %OP * [$i] + add %rax, $X[$i] + adc \$0, %rdx + add $TMP, $X[$i] + adc \$0, %rdx +___ +} +$code.=<<___; + mov %rdx, $X[0] +___ +} + +# +# Swizzle Macros +# + +# macro to copy data from flat space to swizzled table +#MACRO swizzle pDst, pSrc, tmp1, tmp2 +# pDst and pSrc are modified +sub swizzle +{ + my ($pDst, $pSrc, $cnt, $d0)=@_; +$code.=<<___; + mov \$8, $cnt +loop_$m: + mov ($pSrc), $d0 + mov $d0#w, ($pDst) + shr \$16, $d0 + mov $d0#w, (+64*1)($pDst) + shr \$16, $d0 + mov $d0#w, (+64*2)($pDst) + shr \$16, $d0 + mov $d0#w, (+64*3)($pDst) + lea 8($pSrc), $pSrc + lea 64*4($pDst), $pDst + dec $cnt + jnz loop_$m +___ + + $m++; +} + +# macro to copy data from swizzled table to flat space +#MACRO unswizzle pDst, pSrc, tmp*3 +sub unswizzle +{ + my ($pDst, $pSrc, $cnt, $d0, $d1)=@_; +$code.=<<___; + mov \$4, $cnt +loop_$m: + movzxw (+64*3+256*0)($pSrc), $d0 + movzxw (+64*3+256*1)($pSrc), $d1 + shl \$16, $d0 + shl \$16, $d1 + mov (+64*2+256*0)($pSrc), $d0#w + mov (+64*2+256*1)($pSrc), $d1#w + shl \$16, $d0 + shl \$16, $d1 + mov (+64*1+256*0)($pSrc), $d0#w + mov (+64*1+256*1)($pSrc), $d1#w + shl \$16, $d0 + shl \$16, $d1 + mov (+64*0+256*0)($pSrc), $d0#w + mov (+64*0+256*1)($pSrc), $d1#w + mov $d0, (+8*0)($pDst) + mov $d1, (+8*1)($pDst) + lea 256*2($pSrc), $pSrc + lea 8*2($pDst), $pDst + sub \$1, $cnt + jnz loop_$m +___ + + $m++; +} + +# +# Data Structures +# + +# Reduce Data +# +# +# Offset Value +# 0C0 Carries +# 0B8 X2[10] +# 0B0 X2[9] +# 0A8 X2[8] +# 0A0 X2[7] +# 098 X2[6] +# 090 X2[5] +# 088 X2[4] +# 080 X2[3] +# 078 X2[2] +# 070 X2[1] +# 068 X2[0] +# 060 X1[12] P[10] +# 058 X1[11] P[9] Z[8] +# 050 X1[10] P[8] Z[7] +# 048 X1[9] P[7] Z[6] +# 040 X1[8] P[6] Z[5] +# 038 X1[7] P[5] Z[4] +# 030 X1[6] P[4] Z[3] +# 028 X1[5] P[3] Z[2] +# 020 X1[4] P[2] Z[1] +# 018 X1[3] P[1] Z[0] +# 010 X1[2] P[0] Y[2] +# 008 X1[1] Q[1] Y[1] +# 000 X1[0] Q[0] Y[0] + +my $X1_offset = 0; # 13 qwords +my $X2_offset = $X1_offset + 13*8; # 11 qwords +my $Carries_offset = $X2_offset + 11*8; # 1 qword +my $Q_offset = 0; # 2 qwords +my $P_offset = $Q_offset + 2*8; # 11 qwords +my $Y_offset = 0; # 3 qwords +my $Z_offset = $Y_offset + 3*8; # 9 qwords + +my $Red_Data_Size = $Carries_offset + 1*8; # (25 qwords) + +# +# Stack Frame +# +# +# offset value +# ... +# ... +# 280 Garray + +# 278 tmp16[15] +# ... ... +# 200 tmp16[0] + +# 1F8 tmp[7] +# ... ... +# 1C0 tmp[0] + +# 1B8 GT[7] +# ... ... +# 180 GT[0] + +# 178 Reduce Data +# ... ... +# 0B8 Reduce Data +# 0B0 reserved +# 0A8 reserved +# 0A0 reserved +# 098 reserved +# 090 reserved +# 088 reduce result addr +# 080 exp[8] + +# ... +# 048 exp[1] +# 040 exp[0] + +# 038 reserved +# 030 loop_idx +# 028 pg +# 020 i +# 018 pData ; arg 4 +# 010 pG ; arg 2 +# 008 pResult ; arg 1 +# 000 rsp ; stack pointer before subtract + +my $rsp_offset = 0; +my $pResult_offset = 8*1 + $rsp_offset; +my $pG_offset = 8*1 + $pResult_offset; +my $pData_offset = 8*1 + $pG_offset; +my $i_offset = 8*1 + $pData_offset; +my $pg_offset = 8*1 + $i_offset; +my $loop_idx_offset = 8*1 + $pg_offset; +my $reserved1_offset = 8*1 + $loop_idx_offset; +my $exp_offset = 8*1 + $reserved1_offset; +my $red_result_addr_offset= 8*9 + $exp_offset; +my $reserved2_offset = 8*1 + $red_result_addr_offset; +my $Reduce_Data_offset = 8*5 + $reserved2_offset; +my $GT_offset = $Red_Data_Size + $Reduce_Data_offset; +my $tmp_offset = 8*8 + $GT_offset; +my $tmp16_offset = 8*8 + $tmp_offset; +my $garray_offset = 8*16 + $tmp16_offset; +my $mem_size = 8*8*32 + $garray_offset; + +# +# Offsets within Reduce Data +# +# +# struct MODF_2FOLD_MONT_512_C1_DATA { +# UINT64 t[8][8]; +# UINT64 m[8]; +# UINT64 m1[8]; /* 2^768 % m */ +# UINT64 m2[8]; /* 2^640 % m */ +# UINT64 k1[2]; /* (- 1/m) % 2^128 */ +# }; + +my $T = 0; +my $M = 512; # = 8 * 8 * 8 +my $M1 = 576; # = 8 * 8 * 9 /* += 8 * 8 */ +my $M2 = 640; # = 8 * 8 * 10 /* += 8 * 8 */ +my $K1 = 704; # = 8 * 8 * 11 /* += 8 * 8 */ + +# +# FUNCTIONS +# + +{{{ +# +# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords) +# and add 512-bits (8 qwords) +# to get 640 bits (10 qwords) +# Input: 128-bit mul source: [rdi+8*1], rbp +# 512-bit mul source: [rsi+8*n] +# 512-bit add source: r15, r14, ..., r9, r8 +# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0] +# Clobbers all regs except: rcx, rsi, rdi +$code.=<<___; +.type MULADD_128x512,\@abi-omnipotent +.align 16 +MULADD_128x512: +___ + &MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx"); +$code.=<<___; + mov (+8*1)(%rdi), %rbp +___ + &MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx"); +$code.=<<___; + ret +.size MULADD_128x512,.-MULADD_128x512 +___ +}}} + +{{{ +#MULADD_256x512 MACRO pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0 +# +# Inputs: pDst: Destination (768 bits, 12 qwords) +# pA: Multiplicand (1024 bits, 16 qwords) +# pB: Multiplicand (512 bits, 8 qwords) +# Dst = Ah * B + Al +# where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits) +# Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0] +# Uses registers: arguments, RAX, RDX +sub MULADD_256x512 +{ + my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_; +$code.=<<___; + mov (+8*12)($pA), $OP +___ + &MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP); + push(@$X,shift(@$X)); + +$code.=<<___; + mov (+8*13)($pA), $OP +___ + &MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP); + push(@$X,shift(@$X)); + +$code.=<<___; + mov (+8*14)($pA), $OP +___ + &MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP); + push(@$X,shift(@$X)); + +$code.=<<___; + mov (+8*15)($pA), $OP +___ + &MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP); + push(@$X,shift(@$X)); +} + +# +# mont_reduce(UINT64 *x, /* 1024 bits, 16 qwords */ +# UINT64 *m, /* 512 bits, 8 qwords */ +# MODF_2FOLD_MONT_512_C1_DATA *data, +# UINT64 *r) /* 512 bits, 8 qwords */ +# Input: x (number to be reduced): tmp16 (Implicit) +# m (modulus): [pM] (Implicit) +# data (reduce data): [pData] (Implicit) +# Output: r (result): Address in [red_res_addr] +# result also in: r9, r8, r15, r14, r13, r12, r11, r10 + +my @X=map("%r$_",(8..15)); + +$code.=<<___; +.type mont_reduce,\@abi-omnipotent +.align 16 +mont_reduce: +___ + +my $STACK_DEPTH = 8; + # + # X1 = Xh * M1 + Xl +$code.=<<___; + lea (+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi # pX1 (Dst) 769 bits, 13 qwords + mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rsi # pM1 (Bsrc) 512 bits, 8 qwords + add \$$M1, %rsi + lea (+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx # X (Asrc) 1024 bits, 16 qwords + +___ + + &MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X); # rotates @X 4 times + # results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0] + +$code.=<<___; + xor %rax, %rax + # X1 += xl + add (+8*8)(%rcx), $X[4] + adc (+8*9)(%rcx), $X[5] + adc (+8*10)(%rcx), $X[6] + adc (+8*11)(%rcx), $X[7] + adc \$0, %rax + # X1 is now rax, r11-r8, r15-r12, tmp16[3:0] + + # + # check for carry ;; carry stored in rax + mov $X[4], (+8*8)(%rdi) # rdi points to X1 + mov $X[5], (+8*9)(%rdi) + mov $X[6], %rbp + mov $X[7], (+8*11)(%rdi) + + mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp) + + mov (+8*0)(%rdi), $X[4] + mov (+8*1)(%rdi), $X[5] + mov (+8*2)(%rdi), $X[6] + mov (+8*3)(%rdi), $X[7] + + # X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8 + # rdi -> X1 + # rsi -> M1 + + # + # X2 = Xh * M2 + Xl + # do first part (X2 = Xh * M2) + add \$8*10, %rdi # rdi -> pXh ; 128 bits, 2 qwords + # Xh is actually { [rdi+8*1], rbp } + add \$`$M2-$M1`, %rsi # rsi -> M2 + lea (+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> pX2 ; 641 bits, 11 qwords +___ + unshift(@X,pop(@X)); unshift(@X,pop(@X)); +$code.=<<___; + + call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8 + # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0] + mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax + + # X2 += Xl + add (+8*8-8*10)(%rdi), $X[6] # (-8*10) is to adjust rdi -> Xh to Xl + adc (+8*9-8*10)(%rdi), $X[7] + mov $X[6], (+8*8)(%rcx) + mov $X[7], (+8*9)(%rcx) + + adc %rax, %rax + mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp) + + lea (+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi # rdi -> pQ ; 128 bits, 2 qwords + add \$`$K1-$M2`, %rsi # rsi -> pK1 ; 128 bits, 2 qwords + + # MUL_128x128t128 rdi, rcx, rsi ; Q = X2 * K1 (bottom half) + # B1:B0 = rsi[1:0] = K1[1:0] + # A1:A0 = rcx[1:0] = X2[1:0] + # Result = rdi[1],rbp = Q[1],rbp + mov (%rsi), %r8 # B0 + mov (+8*1)(%rsi), %rbx # B1 + + mov (%rcx), %rax # A0 + mul %r8 # B0 + mov %rax, %rbp + mov %rdx, %r9 + + mov (+8*1)(%rcx), %rax # A1 + mul %r8 # B0 + add %rax, %r9 + + mov (%rcx), %rax # A0 + mul %rbx # B1 + add %rax, %r9 + + mov %r9, (+8*1)(%rdi) + # end MUL_128x128t128 + + sub \$`$K1-$M`, %rsi + + mov (%rcx), $X[6] + mov (+8*1)(%rcx), $X[7] # r9:r8 = X2[1:0] + + call MULADD_128x512 # args in rcx, rdi / rbp, rsi, r15-r8 + # result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0] + + # load first half of m to rdx, rdi, rbx, rax + # moved this here for efficiency + mov (+8*0)(%rsi), %rax + mov (+8*1)(%rsi), %rbx + mov (+8*2)(%rsi), %rdi + mov (+8*3)(%rsi), %rdx + + # continue with reduction + mov (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp + + add (+8*8)(%rcx), $X[6] + adc (+8*9)(%rcx), $X[7] + + #accumulate the final carry to rbp + adc %rbp, %rbp + + # Add in overflow corrections: R = (X2>>128) += T[overflow] + # R = {r9, r8, r15, r14, ..., r10} + shl \$3, %rbp + mov (+$pData_offset+$STACK_DEPTH)(%rsp), %rcx # rsi -> Data (and points to T) + add %rcx, %rbp # pT ; 512 bits, 8 qwords, spread out + + # rsi will be used to generate a mask after the addition + xor %rsi, %rsi + + add (+8*8*0)(%rbp), $X[0] + adc (+8*8*1)(%rbp), $X[1] + adc (+8*8*2)(%rbp), $X[2] + adc (+8*8*3)(%rbp), $X[3] + adc (+8*8*4)(%rbp), $X[4] + adc (+8*8*5)(%rbp), $X[5] + adc (+8*8*6)(%rbp), $X[6] + adc (+8*8*7)(%rbp), $X[7] + + # if there is a carry: rsi = 0xFFFFFFFFFFFFFFFF + # if carry is clear: rsi = 0x0000000000000000 + sbb \$0, %rsi + + # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m + and %rsi, %rax + and %rsi, %rbx + and %rsi, %rdi + and %rsi, %rdx + + mov \$1, %rbp + sub %rax, $X[0] + sbb %rbx, $X[1] + sbb %rdi, $X[2] + sbb %rdx, $X[3] + + # if there is a borrow: rbp = 0 + # if there is no borrow: rbp = 1 + # this is used to save the borrows in between the first half and the 2nd half of the subtraction of m + sbb \$0, %rbp + + #load second half of m to rdx, rdi, rbx, rax + + add \$$M, %rcx + mov (+8*4)(%rcx), %rax + mov (+8*5)(%rcx), %rbx + mov (+8*6)(%rcx), %rdi + mov (+8*7)(%rcx), %rdx + + # use the rsi mask as before + # if carry is clear, subtract 0. Otherwise, subtract 256 bits of m + and %rsi, %rax + and %rsi, %rbx + and %rsi, %rdi + and %rsi, %rdx + + # if rbp = 0, there was a borrow before, it is moved to the carry flag + # if rbp = 1, there was not a borrow before, carry flag is cleared + sub \$1, %rbp + + sbb %rax, $X[4] + sbb %rbx, $X[5] + sbb %rdi, $X[6] + sbb %rdx, $X[7] + + # write R back to memory + + mov (+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi + mov $X[0], (+8*0)(%rsi) + mov $X[1], (+8*1)(%rsi) + mov $X[2], (+8*2)(%rsi) + mov $X[3], (+8*3)(%rsi) + mov $X[4], (+8*4)(%rsi) + mov $X[5], (+8*5)(%rsi) + mov $X[6], (+8*6)(%rsi) + mov $X[7], (+8*7)(%rsi) + + ret +.size mont_reduce,.-mont_reduce +___ +}}} + +{{{ +#MUL_512x512 MACRO pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2 +# +# Inputs: pDst: Destination (1024 bits, 16 qwords) +# pA: Multiplicand (512 bits, 8 qwords) +# pB: Multiplicand (512 bits, 8 qwords) +# Uses registers rax, rdx, args +# B operand in [pB] and also in x7...x0 +sub MUL_512x512 +{ + my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_; + my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/); + my @X=@$x; # make a copy + +$code.=<<___; + mov (+8*0)($pA), $OP + + mov $X[0], %rax + mul $OP # rdx:rax = %OP * [0] + mov %rax, (+$pDst_o+8*0)($pDst) + mov %rdx, $X[0] +___ +for(my $i=1;$i<8;$i++) { +$code.=<<___; + mov $X[$i], %rax + mul $OP # rdx:rax = %OP * [$i] + add %rax, $X[$i-1] + adc \$0, %rdx + mov %rdx, $X[$i] +___ +} + +for(my $i=1;$i<8;$i++) { +$code.=<<___; + mov (+8*$i)($pA), $OP +___ + + &MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP); + push(@X,shift(@X)); +} + +$code.=<<___; + mov $X[0], (+$pDst_o+8*8)($pDst) + mov $X[1], (+$pDst_o+8*9)($pDst) + mov $X[2], (+$pDst_o+8*10)($pDst) + mov $X[3], (+$pDst_o+8*11)($pDst) + mov $X[4], (+$pDst_o+8*12)($pDst) + mov $X[5], (+$pDst_o+8*13)($pDst) + mov $X[6], (+$pDst_o+8*14)($pDst) + mov $X[7], (+$pDst_o+8*15)($pDst) +___ +} + +# +# mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits) +# Input: src1: Address of source 1: rdi +# src2: Address of source 2: rsi +# Output: dst: Address of destination: [red_res_addr] +# src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10 +# Temp: Clobbers [tmp16], all registers +$code.=<<___; +.type mont_mul_a3b,\@abi-omnipotent +.align 16 +mont_mul_a3b: + # + # multiply tmp = src1 * src2 + # For multiply: dst = rcx, src1 = rdi, src2 = rsi + # stack depth is extra 8 from call +___ + &MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx"); +$code.=<<___; + # + # Dst = tmp % m + # Call reduce(tmp, m, data, dst) + + # tail recursion optimization: jmp to mont_reduce and return from there + jmp mont_reduce + # call mont_reduce + # ret +.size mont_mul_a3b,.-mont_mul_a3b +___ +}}} + +{{{ +#SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4 +# +# Input in memory [pA] and also in x7...x0 +# Uses all argument registers plus rax and rdx +# +# This version computes all of the off-diagonal terms into memory, +# and then it adds in the diagonal terms + +sub SQR_512 +{ + my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_; + my ($pDst, $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/); + my @X=@$x; # make a copy +$code.=<<___; + # ------------------ + # first pass 01...07 + # ------------------ + mov $X[0], $A + + mov $X[1],%rax + mul $A + mov %rax, (+$pDst_o+8*1)($pDst) +___ +for(my $i=2;$i<8;$i++) { +$code.=<<___; + mov %rdx, $X[$i-2] + mov $X[$i],%rax + mul $A + add %rax, $X[$i-2] + adc \$0, %rdx +___ +} +$code.=<<___; + mov %rdx, $x7 + + mov $X[0], (+$pDst_o+8*2)($pDst) + + # ------------------ + # second pass 12...17 + # ------------------ + + mov (+8*1)($pA), $A + + mov (+8*2)($pA),%rax + mul $A + add %rax, $X[1] + adc \$0, %rdx + mov $X[1], (+$pDst_o+8*3)($pDst) + + mov %rdx, $X[0] + mov (+8*3)($pA),%rax + mul $A + add %rax, $X[2] + adc \$0, %rdx + add $X[0], $X[2] + adc \$0, %rdx + mov $X[2], (+$pDst_o+8*4)($pDst) + + mov %rdx, $X[0] + mov (+8*4)($pA),%rax + mul $A + add %rax, $X[3] + adc \$0, %rdx + add $X[0], $X[3] + adc \$0, %rdx + + mov %rdx, $X[0] + mov (+8*5)($pA),%rax + mul $A + add %rax, $X[4] + adc \$0, %rdx + add $X[0], $X[4] + adc \$0, %rdx + + mov %rdx, $X[0] + mov $X[6],%rax + mul $A + add %rax, $X[5] + adc \$0, %rdx + add $X[0], $X[5] + adc \$0, %rdx + + mov %rdx, $X[0] + mov $X[7],%rax + mul $A + add %rax, $x7 + adc \$0, %rdx + add $X[0], $x7 + adc \$0, %rdx + + mov %rdx, $X[1] + + # ------------------ + # third pass 23...27 + # ------------------ + mov (+8*2)($pA), $A + + mov (+8*3)($pA),%rax + mul $A + add %rax, $X[3] + adc \$0, %rdx + mov $X[3], (+$pDst_o+8*5)($pDst) + + mov %rdx, $X[0] + mov (+8*4)($pA),%rax + mul $A + add %rax, $X[4] + adc \$0, %rdx + add $X[0], $X[4] + adc \$0, %rdx + mov $X[4], (+$pDst_o+8*6)($pDst) + + mov %rdx, $X[0] + mov (+8*5)($pA),%rax + mul $A + add %rax, $X[5] + adc \$0, %rdx + add $X[0], $X[5] + adc \$0, %rdx + + mov %rdx, $X[0] + mov $X[6],%rax + mul $A + add %rax, $x7 + adc \$0, %rdx + add $X[0], $x7 + adc \$0, %rdx + + mov %rdx, $X[0] + mov $X[7],%rax + mul $A + add %rax, $X[1] + adc \$0, %rdx + add $X[0], $X[1] + adc \$0, %rdx + + mov %rdx, $X[2] + + # ------------------ + # fourth pass 34...37 + # ------------------ + + mov (+8*3)($pA), $A + + mov (+8*4)($pA),%rax + mul $A + add %rax, $X[5] + adc \$0, %rdx + mov $X[5], (+$pDst_o+8*7)($pDst) + + mov %rdx, $X[0] + mov (+8*5)($pA),%rax + mul $A + add %rax, $x7 + adc \$0, %rdx + add $X[0], $x7 + adc \$0, %rdx + mov $x7, (+$pDst_o+8*8)($pDst) + + mov %rdx, $X[0] + mov $X[6],%rax + mul $A + add %rax, $X[1] + adc \$0, %rdx + add $X[0], $X[1] + adc \$0, %rdx + + mov %rdx, $X[0] + mov $X[7],%rax + mul $A + add %rax, $X[2] + adc \$0, %rdx + add $X[0], $X[2] + adc \$0, %rdx + + mov %rdx, $X[5] + + # ------------------ + # fifth pass 45...47 + # ------------------ + mov (+8*4)($pA), $A + + mov (+8*5)($pA),%rax + mul $A + add %rax, $X[1] + adc \$0, %rdx + mov $X[1], (+$pDst_o+8*9)($pDst) + + mov %rdx, $X[0] + mov $X[6],%rax + mul $A + add %rax, $X[2] + adc \$0, %rdx + add $X[0], $X[2] + adc \$0, %rdx + mov $X[2], (+$pDst_o+8*10)($pDst) + + mov %rdx, $X[0] + mov $X[7],%rax + mul $A + add %rax, $X[5] + adc \$0, %rdx + add $X[0], $X[5] + adc \$0, %rdx + + mov %rdx, $X[1] + + # ------------------ + # sixth pass 56...57 + # ------------------ + mov (+8*5)($pA), $A + + mov $X[6],%rax + mul $A + add %rax, $X[5] + adc \$0, %rdx + mov $X[5], (+$pDst_o+8*11)($pDst) + + mov %rdx, $X[0] + mov $X[7],%rax + mul $A + add %rax, $X[1] + adc \$0, %rdx + add $X[0], $X[1] + adc \$0, %rdx + mov $X[1], (+$pDst_o+8*12)($pDst) + + mov %rdx, $X[2] + + # ------------------ + # seventh pass 67 + # ------------------ + mov $X[6], $A + + mov $X[7],%rax + mul $A + add %rax, $X[2] + adc \$0, %rdx + mov $X[2], (+$pDst_o+8*13)($pDst) + + mov %rdx, (+$pDst_o+8*14)($pDst) + + # start finalize (add in squares, and double off-terms) + mov (+$pDst_o+8*1)($pDst), $X[0] + mov (+$pDst_o+8*2)($pDst), $X[1] + mov (+$pDst_o+8*3)($pDst), $X[2] + mov (+$pDst_o+8*4)($pDst), $X[3] + mov (+$pDst_o+8*5)($pDst), $X[4] + mov (+$pDst_o+8*6)($pDst), $X[5] + + mov (+8*3)($pA), %rax + mul %rax + mov %rax, $x6 + mov %rdx, $X[6] + + add $X[0], $X[0] + adc $X[1], $X[1] + adc $X[2], $X[2] + adc $X[3], $X[3] + adc $X[4], $X[4] + adc $X[5], $X[5] + adc \$0, $X[6] + + mov (+8*0)($pA), %rax + mul %rax + mov %rax, (+$pDst_o+8*0)($pDst) + mov %rdx, $A + + mov (+8*1)($pA), %rax + mul %rax + + add $A, $X[0] + adc %rax, $X[1] + adc \$0, %rdx + + mov %rdx, $A + mov $X[0], (+$pDst_o+8*1)($pDst) + mov $X[1], (+$pDst_o+8*2)($pDst) + + mov (+8*2)($pA), %rax + mul %rax + + add $A, $X[2] + adc %rax, $X[3] + adc \$0, %rdx + + mov %rdx, $A + + mov $X[2], (+$pDst_o+8*3)($pDst) + mov $X[3], (+$pDst_o+8*4)($pDst) + + xor $tmp, $tmp + add $A, $X[4] + adc $x6, $X[5] + adc \$0, $tmp + + mov $X[4], (+$pDst_o+8*5)($pDst) + mov $X[5], (+$pDst_o+8*6)($pDst) + + # %%tmp has 0/1 in column 7 + # %%A6 has a full value in column 7 + + mov (+$pDst_o+8*7)($pDst), $X[0] + mov (+$pDst_o+8*8)($pDst), $X[1] + mov (+$pDst_o+8*9)($pDst), $X[2] + mov (+$pDst_o+8*10)($pDst), $X[3] + mov (+$pDst_o+8*11)($pDst), $X[4] + mov (+$pDst_o+8*12)($pDst), $X[5] + mov (+$pDst_o+8*13)($pDst), $x6 + mov (+$pDst_o+8*14)($pDst), $x7 + + mov $X[7], %rax + mul %rax + mov %rax, $X[7] + mov %rdx, $A + + add $X[0], $X[0] + adc $X[1], $X[1] + adc $X[2], $X[2] + adc $X[3], $X[3] + adc $X[4], $X[4] + adc $X[5], $X[5] + adc $x6, $x6 + adc $x7, $x7 + adc \$0, $A + + add $tmp, $X[0] + + mov (+8*4)($pA), %rax + mul %rax + + add $X[6], $X[0] + adc %rax, $X[1] + adc \$0, %rdx + + mov %rdx, $tmp + + mov $X[0], (+$pDst_o+8*7)($pDst) + mov $X[1], (+$pDst_o+8*8)($pDst) + + mov (+8*5)($pA), %rax + mul %rax + + add $tmp, $X[2] + adc %rax, $X[3] + adc \$0, %rdx + + mov %rdx, $tmp + + mov $X[2], (+$pDst_o+8*9)($pDst) + mov $X[3], (+$pDst_o+8*10)($pDst) + + mov (+8*6)($pA), %rax + mul %rax + + add $tmp, $X[4] + adc %rax, $X[5] + adc \$0, %rdx + + mov $X[4], (+$pDst_o+8*11)($pDst) + mov $X[5], (+$pDst_o+8*12)($pDst) + + add %rdx, $x6 + adc $X[7], $x7 + adc \$0, $A + + mov $x6, (+$pDst_o+8*13)($pDst) + mov $x7, (+$pDst_o+8*14)($pDst) + mov $A, (+$pDst_o+8*15)($pDst) +___ +} + +# +# sqr_reduce: subroutine to compute Result = reduce(Result * Result) +# +# input and result also in: r9, r8, r15, r14, r13, r12, r11, r10 +# +$code.=<<___; +.type sqr_reduce,\@abi-omnipotent +.align 16 +sqr_reduce: + mov (+$pResult_offset+8)(%rsp), %rcx +___ + &SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi"); +$code.=<<___; + # tail recursion optimization: jmp to mont_reduce and return from there + jmp mont_reduce + # call mont_reduce + # ret +.size sqr_reduce,.-sqr_reduce +___ +}}} + +# +# MAIN FUNCTION +# + +#mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */ +# UINT64 *g, /* 512 bits, 8 qwords */ +# UINT64 *exp, /* 512 bits, 8 qwords */ +# struct mod_ctx_512 *data) + +# window size = 5 +# table size = 2^5 = 32 +#table_entries equ 32 +#table_size equ table_entries * 8 +$code.=<<___; +.globl mod_exp_512 +.type mod_exp_512,\@function,4 +mod_exp_512: + push %rbp + push %rbx + push %r12 + push %r13 + push %r14 + push %r15 + + # adjust stack down and then align it with cache boundary + mov %rsp, %r8 + sub \$$mem_size, %rsp + and \$-64, %rsp + + # store previous stack pointer and arguments + mov %r8, (+$rsp_offset)(%rsp) + mov %rdi, (+$pResult_offset)(%rsp) + mov %rsi, (+$pG_offset)(%rsp) + mov %rcx, (+$pData_offset)(%rsp) +.Lbody: + # transform g into montgomery space + # GT = reduce(g * C2) = reduce(g * (2^256)) + # reduce expects to have the input in [tmp16] + pxor %xmm4, %xmm4 + movdqu (+16*0)(%rsi), %xmm0 + movdqu (+16*1)(%rsi), %xmm1 + movdqu (+16*2)(%rsi), %xmm2 + movdqu (+16*3)(%rsi), %xmm3 + movdqa %xmm4, (+$tmp16_offset+16*0)(%rsp) + movdqa %xmm4, (+$tmp16_offset+16*1)(%rsp) + movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp) + movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp) + movdqa %xmm0, (+$tmp16_offset+16*2)(%rsp) + movdqa %xmm1, (+$tmp16_offset+16*3)(%rsp) + movdqa %xmm2, (+$tmp16_offset+16*4)(%rsp) + movdqa %xmm3, (+$tmp16_offset+16*5)(%rsp) + + # load pExp before rdx gets blown away + movdqu (+16*0)(%rdx), %xmm0 + movdqu (+16*1)(%rdx), %xmm1 + movdqu (+16*2)(%rdx), %xmm2 + movdqu (+16*3)(%rdx), %xmm3 + + lea (+$GT_offset)(%rsp), %rbx + mov %rbx, (+$red_result_addr_offset)(%rsp) + call mont_reduce + + # Initialize tmp = C + lea (+$tmp_offset)(%rsp), %rcx + xor %rax, %rax + mov %rax, (+8*0)(%rcx) + mov %rax, (+8*1)(%rcx) + mov %rax, (+8*3)(%rcx) + mov %rax, (+8*4)(%rcx) + mov %rax, (+8*5)(%rcx) + mov %rax, (+8*6)(%rcx) + mov %rax, (+8*7)(%rcx) + mov %rax, (+$exp_offset+8*8)(%rsp) + movq \$1, (+8*2)(%rcx) + + lea (+$garray_offset)(%rsp), %rbp + mov %rcx, %rsi # pTmp + mov %rbp, %rdi # Garray[][0] +___ + + &swizzle("%rdi", "%rcx", "%rax", "%rbx"); + + # for (rax = 31; rax != 0; rax--) { + # tmp = reduce(tmp * G) + # swizzle(pg, tmp); + # pg += 2; } +$code.=<<___; + mov \$31, %rax + mov %rax, (+$i_offset)(%rsp) + mov %rbp, (+$pg_offset)(%rsp) + # rsi -> pTmp + mov %rsi, (+$red_result_addr_offset)(%rsp) + mov (+8*0)(%rsi), %r10 + mov (+8*1)(%rsi), %r11 + mov (+8*2)(%rsi), %r12 + mov (+8*3)(%rsi), %r13 + mov (+8*4)(%rsi), %r14 + mov (+8*5)(%rsi), %r15 + mov (+8*6)(%rsi), %r8 + mov (+8*7)(%rsi), %r9 +init_loop: + lea (+$GT_offset)(%rsp), %rdi + call mont_mul_a3b + lea (+$tmp_offset)(%rsp), %rsi + mov (+$pg_offset)(%rsp), %rbp + add \$2, %rbp + mov %rbp, (+$pg_offset)(%rsp) + mov %rsi, %rcx # rcx = rsi = addr of tmp +___ + + &swizzle("%rbp", "%rcx", "%rax", "%rbx"); +$code.=<<___; + mov (+$i_offset)(%rsp), %rax + sub \$1, %rax + mov %rax, (+$i_offset)(%rsp) + jne init_loop + + # + # Copy exponent onto stack + movdqa %xmm0, (+$exp_offset+16*0)(%rsp) + movdqa %xmm1, (+$exp_offset+16*1)(%rsp) + movdqa %xmm2, (+$exp_offset+16*2)(%rsp) + movdqa %xmm3, (+$exp_offset+16*3)(%rsp) + + + # + # Do exponentiation + # Initialize result to G[exp{511:507}] + mov (+$exp_offset+62)(%rsp), %eax + mov %rax, %rdx + shr \$11, %rax + and \$0x07FF, %edx + mov %edx, (+$exp_offset+62)(%rsp) + lea (+$garray_offset)(%rsp,%rax,2), %rsi + mov (+$pResult_offset)(%rsp), %rdx +___ + + &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax"); + + # + # Loop variables + # rcx = [loop_idx] = index: 510-5 to 0 by 5 +$code.=<<___; + movq \$505, (+$loop_idx_offset)(%rsp) + + mov (+$pResult_offset)(%rsp), %rcx + mov %rcx, (+$red_result_addr_offset)(%rsp) + mov (+8*0)(%rcx), %r10 + mov (+8*1)(%rcx), %r11 + mov (+8*2)(%rcx), %r12 + mov (+8*3)(%rcx), %r13 + mov (+8*4)(%rcx), %r14 + mov (+8*5)(%rcx), %r15 + mov (+8*6)(%rcx), %r8 + mov (+8*7)(%rcx), %r9 + jmp sqr_2 + +main_loop_a3b: + call sqr_reduce + call sqr_reduce + call sqr_reduce +sqr_2: + call sqr_reduce + call sqr_reduce + + # + # Do multiply, first look up proper value in Garray + mov (+$loop_idx_offset)(%rsp), %rcx # bit index + mov %rcx, %rax + shr \$4, %rax # rax is word pointer + mov (+$exp_offset)(%rsp,%rax,2), %edx + and \$15, %rcx + shrq %cl, %rdx + and \$0x1F, %rdx + + lea (+$garray_offset)(%rsp,%rdx,2), %rsi + lea (+$tmp_offset)(%rsp), %rdx + mov %rdx, %rdi +___ + + &unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax"); + # rdi = tmp = pG + + # + # Call mod_mul_a1(pDst, pSrc1, pSrc2, pM, pData) + # result result pG M Data +$code.=<<___; + mov (+$pResult_offset)(%rsp), %rsi + call mont_mul_a3b + + # + # finish loop + mov (+$loop_idx_offset)(%rsp), %rcx + sub \$5, %rcx + mov %rcx, (+$loop_idx_offset)(%rsp) + jge main_loop_a3b + + # + +end_main_loop_a3b: + # transform result out of Montgomery space + # result = reduce(result) + mov (+$pResult_offset)(%rsp), %rdx + pxor %xmm4, %xmm4 + movdqu (+16*0)(%rdx), %xmm0 + movdqu (+16*1)(%rdx), %xmm1 + movdqu (+16*2)(%rdx), %xmm2 + movdqu (+16*3)(%rdx), %xmm3 + movdqa %xmm4, (+$tmp16_offset+16*4)(%rsp) + movdqa %xmm4, (+$tmp16_offset+16*5)(%rsp) + movdqa %xmm4, (+$tmp16_offset+16*6)(%rsp) + movdqa %xmm4, (+$tmp16_offset+16*7)(%rsp) + movdqa %xmm0, (+$tmp16_offset+16*0)(%rsp) + movdqa %xmm1, (+$tmp16_offset+16*1)(%rsp) + movdqa %xmm2, (+$tmp16_offset+16*2)(%rsp) + movdqa %xmm3, (+$tmp16_offset+16*3)(%rsp) + call mont_reduce + + # If result > m, subract m + # load result into r15:r8 + mov (+$pResult_offset)(%rsp), %rax + mov (+8*0)(%rax), %r8 + mov (+8*1)(%rax), %r9 + mov (+8*2)(%rax), %r10 + mov (+8*3)(%rax), %r11 + mov (+8*4)(%rax), %r12 + mov (+8*5)(%rax), %r13 + mov (+8*6)(%rax), %r14 + mov (+8*7)(%rax), %r15 + + # subtract m + mov (+$pData_offset)(%rsp), %rbx + add \$$M, %rbx + + sub (+8*0)(%rbx), %r8 + sbb (+8*1)(%rbx), %r9 + sbb (+8*2)(%rbx), %r10 + sbb (+8*3)(%rbx), %r11 + sbb (+8*4)(%rbx), %r12 + sbb (+8*5)(%rbx), %r13 + sbb (+8*6)(%rbx), %r14 + sbb (+8*7)(%rbx), %r15 + + # if Carry is clear, replace result with difference + mov (+8*0)(%rax), %rsi + mov (+8*1)(%rax), %rdi + mov (+8*2)(%rax), %rcx + mov (+8*3)(%rax), %rdx + cmovnc %r8, %rsi + cmovnc %r9, %rdi + cmovnc %r10, %rcx + cmovnc %r11, %rdx + mov %rsi, (+8*0)(%rax) + mov %rdi, (+8*1)(%rax) + mov %rcx, (+8*2)(%rax) + mov %rdx, (+8*3)(%rax) + + mov (+8*4)(%rax), %rsi + mov (+8*5)(%rax), %rdi + mov (+8*6)(%rax), %rcx + mov (+8*7)(%rax), %rdx + cmovnc %r12, %rsi + cmovnc %r13, %rdi + cmovnc %r14, %rcx + cmovnc %r15, %rdx + mov %rsi, (+8*4)(%rax) + mov %rdi, (+8*5)(%rax) + mov %rcx, (+8*6)(%rax) + mov %rdx, (+8*7)(%rax) + + mov (+$rsp_offset)(%rsp), %rsi + mov 0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbx + mov 40(%rsi),%rbp + lea 48(%rsi),%rsp +.Lepilogue: + ret +.size mod_exp_512, . - mod_exp_512 +___ + +if ($win64) { +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +my $rec="%rcx"; +my $frame="%rdx"; +my $context="%r8"; +my $disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type mod_exp_512_se_handler,\@abi-omnipotent +.align 16 +mod_exp_512_se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + lea .Lbody(%rip),%r10 + cmp %r10,%rbx # context->RipRsp + + lea .Lepilogue(%rip),%r10 + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + mov $rsp_offset(%rax),%rax # pull saved Rsp + + mov 32(%rax),%rbx + mov 40(%rax),%rbp + mov 24(%rax),%r12 + mov 16(%rax),%r13 + mov 8(%rax),%r14 + mov 0(%rax),%r15 + lea 48(%rax),%rax + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size mod_exp_512_se_handler,.-mod_exp_512_se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_mod_exp_512 + .rva .LSEH_end_mod_exp_512 + .rva .LSEH_info_mod_exp_512 + +.section .xdata +.align 8 +.LSEH_info_mod_exp_512: + .byte 9,0,0,0 + .rva mod_exp_512_se_handler +___ +} + +sub reg_part { +my ($reg,$conv)=@_; + if ($reg =~ /%r[0-9]+/) { $reg .= $conv; } + elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; } + elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; } + elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; } + return $reg; +} + +$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem; +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/(\(\+[^)]+\))/eval $1/gem; +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/bn/asm/parisc-mont.pl b/src/lib/libcrypto/bn/asm/parisc-mont.pl new file mode 100644 index 0000000000..4a766a87fb --- /dev/null +++ b/src/lib/libcrypto/bn/asm/parisc-mont.pl @@ -0,0 +1,993 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# On PA-7100LC this module performs ~90-50% better, less for longer +# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means +# that compiler utilized xmpyu instruction to perform 32x32=64-bit +# multiplication, which in turn means that "baseline" performance was +# optimal in respect to instruction set capabilities. Fair comparison +# with vendor compiler is problematic, because OpenSSL doesn't define +# BN_LLONG [presumably] for historical reasons, which drives compiler +# toward 4 times 16x16=32-bit multiplicatons [plus complementary +# shifts and additions] instead. This means that you should observe +# several times improvement over code generated by vendor compiler +# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual +# improvement coefficient was never collected on PA-7100LC, or any +# other 1.1 CPU, because I don't have access to such machine with +# vendor compiler. But to give you a taste, PA-RISC 1.1 code path +# reportedly outperformed code generated by cc +DA1.1 +O3 by factor +# of ~5x on PA-8600. +# +# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is +# reportedly ~2x faster than vendor compiler generated code [according +# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of +# this implementation is actually 32-bit one, in the sense that it +# operates on 32-bit values. But pa-risc2[W].s operates on arrays of +# 64-bit BN_LONGs... How do they interoperate then? No problem. This +# module picks halves of 64-bit values in reverse order and pretends +# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure" +# 64-bit code such as pa-risc2[W].s then? Well, the thing is that +# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do, +# i.e. there is no "wider" multiplication like on most other 64-bit +# platforms. This means that even being effectively 32-bit, this +# implementation performs "64-bit" computational task in same amount +# of arithmetic operations, most notably multiplications. It requires +# more memory references, most notably to tp[num], but this doesn't +# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC +# 2.0 code path, provides virtually same performance as pa-risc2[W].s: +# it's ~10% better for shortest key length and ~10% worse for longest +# one. +# +# In case it wasn't clear. The module has two distinct code paths: +# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit +# additions and 64-bit integer loads, not to mention specific +# instruction scheduling. In 64-bit build naturally only 2.0 code path +# is assembled. In 32-bit application context both code paths are +# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path +# is taken automatically. Also, in 32-bit build the module imposes +# couple of limitations: vector lengths has to be even and vector +# addresses has to be 64-bit aligned. Normally neither is a problem: +# most common key lengths are even and vectors are commonly malloc-ed, +# which ensures alignment. +# +# Special thanks to polarhome.com for providing HP-UX account on +# PA-RISC 1.1 machine, and to correspondent who chose to remain +# anonymous for testing the code on PA-RISC 2.0 machine. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + +$flavour = shift; +$output = shift; + +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; + $BN_SZ =$SIZE_T; +} else { + $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; + $BN_SZ =$SIZE_T; + if (open CONF,"<${dir}../../opensslconf.h") { + while() { + if (m/#\s*define\s+SIXTY_FOUR_BIT/) { + $BN_SZ=8; + $LEVEL="2.0"; + last; + } + } + close CONF; + } +} + +$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker + # [+ argument transfer] +$LOCALS=$FRAME-$FRAME_MARKER; +$FRAME+=32; # local variables + +$tp="%r31"; +$ti1="%r29"; +$ti0="%r28"; + +$rp="%r26"; +$ap="%r25"; +$bp="%r24"; +$np="%r23"; +$n0="%r22"; # passed through stack in 32-bit +$num="%r21"; # passed through stack in 32-bit +$idx="%r20"; +$arrsz="%r19"; + +$nm1="%r7"; +$nm0="%r6"; +$ab1="%r5"; +$ab0="%r4"; + +$fp="%r3"; +$hi1="%r2"; +$hi0="%r1"; + +$xfer=$n0; # accomodates [-16..15] offset in fld[dw]s + +$fm0="%fr4"; $fti=$fm0; +$fbi="%fr5L"; +$fn0="%fr5R"; +$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8"; +$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11"; + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR + .ALIGN 64 +bn_mul_mont + .PROC + .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + ldo -$FRAME(%sp),$fp +___ +$code.=<<___ if ($SIZE_T==4); + ldw `-$FRAME_MARKER-4`($fp),$n0 + ldw `-$FRAME_MARKER-8`($fp),$num + nop + nop ; alignment +___ +$code.=<<___ if ($BN_SZ==4); + comiclr,<= 6,$num,%r0 ; are vectors long enough? + b L\$abort + ldi 0,%r28 ; signal "unhandled" + add,ev %r0,$num,$num ; is $num even? + b L\$abort + nop + or $ap,$np,$ti1 + extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned? + b L\$abort + nop + nop ; alignment + nop + + fldws 0($n0),${fn0} + fldws,ma 4($bp),${fbi} ; bp[0] +___ +$code.=<<___ if ($BN_SZ==8); + comib,> 3,$num,L\$abort ; are vectors long enough? + ldi 0,%r28 ; signal "unhandled" + addl $num,$num,$num ; I operate on 32-bit values + + fldws 4($n0),${fn0} ; only low part of n0 + fldws 4($bp),${fbi} ; bp[0] in flipped word order +___ +$code.=<<___; + fldds 0($ap),${fai} ; ap[0,1] + fldds 0($np),${fni} ; np[0,1] + + sh2addl $num,%r0,$arrsz + ldi 31,$hi0 + ldo 36($arrsz),$hi1 ; space for tp[num+1] + andcm $hi1,$hi0,$hi1 ; align + addl $hi1,%sp,%sp + $PUSH $fp,-$SIZE_T(%sp) + + ldo `$LOCALS+16`($fp),$xfer + ldo `$LOCALS+32+4`($fp),$tp + + xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0] + xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0] + xmpyu ${fn0},${fab0}R,${fm0} + + addl $arrsz,$ap,$ap ; point at the end + addl $arrsz,$np,$np + subi 0,$arrsz,$idx ; j=0 + ldo 8($idx),$idx ; j++++ + + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m + fstds ${fab0},-16($xfer) + fstds ${fnm0},-8($xfer) + fstds ${fab1},0($xfer) + fstds ${fnm1},8($xfer) + flddx $idx($ap),${fai} ; ap[2,3] + flddx $idx($np),${fni} ; np[2,3] +___ +$code.=<<___ if ($BN_SZ==4); + mtctl $hi0,%cr11 ; $hi0 still holds 31 + extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0 + b L\$parisc11 + nop +___ +$code.=<<___; # PA-RISC 2.0 code-path + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldd -16($xfer),$ab0 + fstds ${fab0},-16($xfer) + + extrd,u $ab0,31,32,$hi0 + extrd,u $ab0,63,32,$ab0 + ldd -8($xfer),$nm0 + fstds ${fnm0},-8($xfer) + ldo 8($idx),$idx ; j++++ + addl $ab0,$nm0,$nm0 ; low part is discarded + extrd,u $nm0,31,32,$hi1 + +L\$1st + xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m + ldd 0($xfer),$ab1 + fstds ${fab1},0($xfer) + addl $hi0,$ab1,$ab1 + extrd,u $ab1,31,32,$hi0 + ldd 8($xfer),$nm1 + fstds ${fnm1},8($xfer) + extrd,u $ab1,63,32,$ab1 + addl $hi1,$nm1,$nm1 + flddx $idx($ap),${fai} ; ap[j,j+1] + flddx $idx($np),${fni} ; np[j,j+1] + addl $ab1,$nm1,$nm1 + extrd,u $nm1,31,32,$hi1 + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldd -16($xfer),$ab0 + fstds ${fab0},-16($xfer) + addl $hi0,$ab0,$ab0 + extrd,u $ab0,31,32,$hi0 + ldd -8($xfer),$nm0 + fstds ${fnm0},-8($xfer) + extrd,u $ab0,63,32,$ab0 + addl $hi1,$nm0,$nm0 + stw $nm1,-4($tp) ; tp[j-1] + addl $ab0,$nm0,$nm0 + stw,ma $nm0,8($tp) ; tp[j-1] + addib,<> 8,$idx,L\$1st ; j++++ + extrd,u $nm0,31,32,$hi1 + + xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m + ldd 0($xfer),$ab1 + fstds ${fab1},0($xfer) + addl $hi0,$ab1,$ab1 + extrd,u $ab1,31,32,$hi0 + ldd 8($xfer),$nm1 + fstds ${fnm1},8($xfer) + extrd,u $ab1,63,32,$ab1 + addl $hi1,$nm1,$nm1 + ldd -16($xfer),$ab0 + addl $ab1,$nm1,$nm1 + ldd -8($xfer),$nm0 + extrd,u $nm1,31,32,$hi1 + + addl $hi0,$ab0,$ab0 + extrd,u $ab0,31,32,$hi0 + stw $nm1,-4($tp) ; tp[j-1] + extrd,u $ab0,63,32,$ab0 + addl $hi1,$nm0,$nm0 + ldd 0($xfer),$ab1 + addl $ab0,$nm0,$nm0 + ldd,mb 8($xfer),$nm1 + extrd,u $nm0,31,32,$hi1 + stw,ma $nm0,8($tp) ; tp[j-1] + + ldo -1($num),$num ; i-- + subi 0,$arrsz,$idx ; j=0 +___ +$code.=<<___ if ($BN_SZ==4); + fldws,ma 4($bp),${fbi} ; bp[1] +___ +$code.=<<___ if ($BN_SZ==8); + fldws 0($bp),${fbi} ; bp[1] in flipped word order +___ +$code.=<<___; + flddx $idx($ap),${fai} ; ap[0,1] + flddx $idx($np),${fni} ; np[0,1] + fldws 8($xfer),${fti}R ; tp[0] + addl $hi0,$ab1,$ab1 + extrd,u $ab1,31,32,$hi0 + extrd,u $ab1,63,32,$ab1 + ldo 8($idx),$idx ; j++++ + xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] + xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] + addl $hi1,$nm1,$nm1 + addl $ab1,$nm1,$nm1 + extrd,u $nm1,31,32,$hi1 + fstws,mb ${fab0}L,-8($xfer) ; save high part + stw $nm1,-4($tp) ; tp[j-1] + + fcpy,sgl %fr0,${fti}L ; zero high part + fcpy,sgl %fr0,${fab0}L + addl $hi1,$hi0,$hi0 + extrd,u $hi0,31,32,$hi1 + fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double + fcnvxf,dbl,dbl ${fab0},${fab0} + stw $hi0,0($tp) + stw $hi1,4($tp) + + fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] + fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int + xmpyu ${fn0},${fab0}R,${fm0} + ldo `$LOCALS+32+4`($fp),$tp +L\$outer + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m + fstds ${fab0},-16($xfer) ; 33-bit value + fstds ${fnm0},-8($xfer) + flddx $idx($ap),${fai} ; ap[2] + flddx $idx($np),${fni} ; np[2] + ldo 8($idx),$idx ; j++++ + ldd -16($xfer),$ab0 ; 33-bit value + ldd -8($xfer),$nm0 + ldw 0($xfer),$hi0 ; high part + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + extrd,u $ab0,31,32,$ti0 ; carry bit + extrd,u $ab0,63,32,$ab0 + fstds ${fab1},0($xfer) + addl $ti0,$hi0,$hi0 ; account carry bit + fstds ${fnm1},8($xfer) + addl $ab0,$nm0,$nm0 ; low part is discarded + ldw 0($tp),$ti1 ; tp[1] + extrd,u $nm0,31,32,$hi1 + fstds ${fab0},-16($xfer) + fstds ${fnm0},-8($xfer) + +L\$inner + xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m + ldd 0($xfer),$ab1 + fstds ${fab1},0($xfer) + addl $hi0,$ti1,$ti1 + addl $ti1,$ab1,$ab1 + ldd 8($xfer),$nm1 + fstds ${fnm1},8($xfer) + extrd,u $ab1,31,32,$hi0 + extrd,u $ab1,63,32,$ab1 + flddx $idx($ap),${fai} ; ap[j,j+1] + flddx $idx($np),${fni} ; np[j,j+1] + addl $hi1,$nm1,$nm1 + addl $ab1,$nm1,$nm1 + ldw 4($tp),$ti0 ; tp[j] + stw $nm1,-4($tp) ; tp[j-1] + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldd -16($xfer),$ab0 + fstds ${fab0},-16($xfer) + addl $hi0,$ti0,$ti0 + addl $ti0,$ab0,$ab0 + ldd -8($xfer),$nm0 + fstds ${fnm0},-8($xfer) + extrd,u $ab0,31,32,$hi0 + extrd,u $nm1,31,32,$hi1 + ldw 8($tp),$ti1 ; tp[j] + extrd,u $ab0,63,32,$ab0 + addl $hi1,$nm0,$nm0 + addl $ab0,$nm0,$nm0 + stw,ma $nm0,8($tp) ; tp[j-1] + addib,<> 8,$idx,L\$inner ; j++++ + extrd,u $nm0,31,32,$hi1 + + xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m + ldd 0($xfer),$ab1 + fstds ${fab1},0($xfer) + addl $hi0,$ti1,$ti1 + addl $ti1,$ab1,$ab1 + ldd 8($xfer),$nm1 + fstds ${fnm1},8($xfer) + extrd,u $ab1,31,32,$hi0 + extrd,u $ab1,63,32,$ab1 + ldw 4($tp),$ti0 ; tp[j] + addl $hi1,$nm1,$nm1 + addl $ab1,$nm1,$nm1 + ldd -16($xfer),$ab0 + ldd -8($xfer),$nm0 + extrd,u $nm1,31,32,$hi1 + + addl $hi0,$ab0,$ab0 + addl $ti0,$ab0,$ab0 + stw $nm1,-4($tp) ; tp[j-1] + extrd,u $ab0,31,32,$hi0 + ldw 8($tp),$ti1 ; tp[j] + extrd,u $ab0,63,32,$ab0 + addl $hi1,$nm0,$nm0 + ldd 0($xfer),$ab1 + addl $ab0,$nm0,$nm0 + ldd,mb 8($xfer),$nm1 + extrd,u $nm0,31,32,$hi1 + stw,ma $nm0,8($tp) ; tp[j-1] + + addib,= -1,$num,L\$outerdone ; i-- + subi 0,$arrsz,$idx ; j=0 +___ +$code.=<<___ if ($BN_SZ==4); + fldws,ma 4($bp),${fbi} ; bp[i] +___ +$code.=<<___ if ($BN_SZ==8); + ldi 12,$ti0 ; bp[i] in flipped word order + addl,ev %r0,$num,$num + ldi -4,$ti0 + addl $ti0,$bp,$bp + fldws 0($bp),${fbi} +___ +$code.=<<___; + flddx $idx($ap),${fai} ; ap[0] + addl $hi0,$ab1,$ab1 + flddx $idx($np),${fni} ; np[0] + fldws 8($xfer),${fti}R ; tp[0] + addl $ti1,$ab1,$ab1 + extrd,u $ab1,31,32,$hi0 + extrd,u $ab1,63,32,$ab1 + + ldo 8($idx),$idx ; j++++ + xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] + xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] + ldw 4($tp),$ti0 ; tp[j] + + addl $hi1,$nm1,$nm1 + fstws,mb ${fab0}L,-8($xfer) ; save high part + addl $ab1,$nm1,$nm1 + extrd,u $nm1,31,32,$hi1 + fcpy,sgl %fr0,${fti}L ; zero high part + fcpy,sgl %fr0,${fab0}L + stw $nm1,-4($tp) ; tp[j-1] + + fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double + fcnvxf,dbl,dbl ${fab0},${fab0} + addl $hi1,$hi0,$hi0 + fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] + addl $ti0,$hi0,$hi0 + extrd,u $hi0,31,32,$hi1 + fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int + stw $hi0,0($tp) + stw $hi1,4($tp) + xmpyu ${fn0},${fab0}R,${fm0} + + b L\$outer + ldo `$LOCALS+32+4`($fp),$tp + +L\$outerdone + addl $hi0,$ab1,$ab1 + addl $ti1,$ab1,$ab1 + extrd,u $ab1,31,32,$hi0 + extrd,u $ab1,63,32,$ab1 + + ldw 4($tp),$ti0 ; tp[j] + + addl $hi1,$nm1,$nm1 + addl $ab1,$nm1,$nm1 + extrd,u $nm1,31,32,$hi1 + stw $nm1,-4($tp) ; tp[j-1] + + addl $hi1,$hi0,$hi0 + addl $ti0,$hi0,$hi0 + extrd,u $hi0,31,32,$hi1 + stw $hi0,0($tp) + stw $hi1,4($tp) + + ldo `$LOCALS+32`($fp),$tp + sub %r0,%r0,%r0 ; clear borrow +___ +$code.=<<___ if ($BN_SZ==4); + ldws,ma 4($tp),$ti0 + extru,= $rp,31,3,%r0 ; is rp 64-bit aligned? + b L\$sub_pa11 + addl $tp,$arrsz,$tp +L\$sub + ldwx $idx($np),$hi0 + subb $ti0,$hi0,$hi1 + ldwx $idx($tp),$ti0 + addib,<> 4,$idx,L\$sub + stws,ma $hi1,4($rp) + + subb $ti0,%r0,$hi1 + ldo -4($tp),$tp +___ +$code.=<<___ if ($BN_SZ==8); + ldd,ma 8($tp),$ti0 +L\$sub + ldd $idx($np),$hi0 + shrpd $ti0,$ti0,32,$ti0 ; flip word order + std $ti0,-8($tp) ; save flipped value + sub,db $ti0,$hi0,$hi1 + ldd,ma 8($tp),$ti0 + addib,<> 8,$idx,L\$sub + std,ma $hi1,8($rp) + + extrd,u $ti0,31,32,$ti0 ; carry in flipped word order + sub,db $ti0,%r0,$hi1 + ldo -8($tp),$tp +___ +$code.=<<___; + and $tp,$hi1,$ap + andcm $rp,$hi1,$bp + or $ap,$bp,$np + + sub $rp,$arrsz,$rp ; rewind rp + subi 0,$arrsz,$idx + ldo `$LOCALS+32`($fp),$tp +L\$copy + ldd $idx($np),$hi0 + std,ma %r0,8($tp) + addib,<> 8,$idx,.-8 ; L\$copy + std,ma $hi0,8($rp) +___ + +if ($BN_SZ==4) { # PA-RISC 1.1 code-path +$ablo=$ab0; +$abhi=$ab1; +$nmlo0=$nm0; +$nmhi0=$nm1; +$nmlo1="%r9"; +$nmhi1="%r8"; + +$code.=<<___; + b L\$done + nop + + .ALIGN 8 +L\$parisc11 + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldw -12($xfer),$ablo + ldw -16($xfer),$hi0 + ldw -4($xfer),$nmlo0 + ldw -8($xfer),$nmhi0 + fstds ${fab0},-16($xfer) + fstds ${fnm0},-8($xfer) + + ldo 8($idx),$idx ; j++++ + add $ablo,$nmlo0,$nmlo0 ; discarded + addc %r0,$nmhi0,$hi1 + ldw 4($xfer),$ablo + ldw 0($xfer),$abhi + nop + +L\$1st_pa11 + xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] + flddx $idx($ap),${fai} ; ap[j,j+1] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m + flddx $idx($np),${fni} ; np[j,j+1] + add $hi0,$ablo,$ablo + ldw 12($xfer),$nmlo1 + addc %r0,$abhi,$hi0 + ldw 8($xfer),$nmhi1 + add $ablo,$nmlo1,$nmlo1 + fstds ${fab1},0($xfer) + addc %r0,$nmhi1,$nmhi1 + fstds ${fnm1},8($xfer) + add $hi1,$nmlo1,$nmlo1 + ldw -12($xfer),$ablo + addc %r0,$nmhi1,$hi1 + ldw -16($xfer),$abhi + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] + ldw -4($xfer),$nmlo0 + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldw -8($xfer),$nmhi0 + add $hi0,$ablo,$ablo + stw $nmlo1,-4($tp) ; tp[j-1] + addc %r0,$abhi,$hi0 + fstds ${fab0},-16($xfer) + add $ablo,$nmlo0,$nmlo0 + fstds ${fnm0},-8($xfer) + addc %r0,$nmhi0,$nmhi0 + ldw 0($xfer),$abhi + add $hi1,$nmlo0,$nmlo0 + ldw 4($xfer),$ablo + stws,ma $nmlo0,8($tp) ; tp[j-1] + addib,<> 8,$idx,L\$1st_pa11 ; j++++ + addc %r0,$nmhi0,$hi1 + + ldw 8($xfer),$nmhi1 + ldw 12($xfer),$nmlo1 + xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m + add $hi0,$ablo,$ablo + fstds ${fab1},0($xfer) + addc %r0,$abhi,$hi0 + fstds ${fnm1},8($xfer) + add $ablo,$nmlo1,$nmlo1 + ldw -16($xfer),$abhi + addc %r0,$nmhi1,$nmhi1 + ldw -12($xfer),$ablo + add $hi1,$nmlo1,$nmlo1 + ldw -8($xfer),$nmhi0 + addc %r0,$nmhi1,$hi1 + ldw -4($xfer),$nmlo0 + + add $hi0,$ablo,$ablo + stw $nmlo1,-4($tp) ; tp[j-1] + addc %r0,$abhi,$hi0 + ldw 0($xfer),$abhi + add $ablo,$nmlo0,$nmlo0 + ldw 4($xfer),$ablo + addc %r0,$nmhi0,$nmhi0 + ldws,mb 8($xfer),$nmhi1 + add $hi1,$nmlo0,$nmlo0 + ldw 4($xfer),$nmlo1 + addc %r0,$nmhi0,$hi1 + stws,ma $nmlo0,8($tp) ; tp[j-1] + + ldo -1($num),$num ; i-- + subi 0,$arrsz,$idx ; j=0 + + fldws,ma 4($bp),${fbi} ; bp[1] + flddx $idx($ap),${fai} ; ap[0,1] + flddx $idx($np),${fni} ; np[0,1] + fldws 8($xfer),${fti}R ; tp[0] + add $hi0,$ablo,$ablo + addc %r0,$abhi,$hi0 + ldo 8($idx),$idx ; j++++ + xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] + xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] + add $hi1,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$nmhi1 + add $ablo,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$hi1 + fstws,mb ${fab0}L,-8($xfer) ; save high part + stw $nmlo1,-4($tp) ; tp[j-1] + + fcpy,sgl %fr0,${fti}L ; zero high part + fcpy,sgl %fr0,${fab0}L + add $hi1,$hi0,$hi0 + addc %r0,%r0,$hi1 + fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double + fcnvxf,dbl,dbl ${fab0},${fab0} + stw $hi0,0($tp) + stw $hi1,4($tp) + + fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] + fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int + xmpyu ${fn0},${fab0}R,${fm0} + ldo `$LOCALS+32+4`($fp),$tp +L\$outer_pa11 + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m + fstds ${fab0},-16($xfer) ; 33-bit value + fstds ${fnm0},-8($xfer) + flddx $idx($ap),${fai} ; ap[2,3] + flddx $idx($np),${fni} ; np[2,3] + ldw -16($xfer),$abhi ; carry bit actually + ldo 8($idx),$idx ; j++++ + ldw -12($xfer),$ablo + ldw -8($xfer),$nmhi0 + ldw -4($xfer),$nmlo0 + ldw 0($xfer),$hi0 ; high part + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + fstds ${fab1},0($xfer) + addl $abhi,$hi0,$hi0 ; account carry bit + fstds ${fnm1},8($xfer) + add $ablo,$nmlo0,$nmlo0 ; discarded + ldw 0($tp),$ti1 ; tp[1] + addc %r0,$nmhi0,$hi1 + fstds ${fab0},-16($xfer) + fstds ${fnm0},-8($xfer) + ldw 4($xfer),$ablo + ldw 0($xfer),$abhi + +L\$inner_pa11 + xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] + flddx $idx($ap),${fai} ; ap[j,j+1] + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m + flddx $idx($np),${fni} ; np[j,j+1] + add $hi0,$ablo,$ablo + ldw 4($tp),$ti0 ; tp[j] + addc %r0,$abhi,$abhi + ldw 12($xfer),$nmlo1 + add $ti1,$ablo,$ablo + ldw 8($xfer),$nmhi1 + addc %r0,$abhi,$hi0 + fstds ${fab1},0($xfer) + add $ablo,$nmlo1,$nmlo1 + fstds ${fnm1},8($xfer) + addc %r0,$nmhi1,$nmhi1 + ldw -12($xfer),$ablo + add $hi1,$nmlo1,$nmlo1 + ldw -16($xfer),$abhi + addc %r0,$nmhi1,$hi1 + + xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] + ldw 8($tp),$ti1 ; tp[j] + xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m + ldw -4($xfer),$nmlo0 + add $hi0,$ablo,$ablo + ldw -8($xfer),$nmhi0 + addc %r0,$abhi,$abhi + stw $nmlo1,-4($tp) ; tp[j-1] + add $ti0,$ablo,$ablo + fstds ${fab0},-16($xfer) + addc %r0,$abhi,$hi0 + fstds ${fnm0},-8($xfer) + add $ablo,$nmlo0,$nmlo0 + ldw 4($xfer),$ablo + addc %r0,$nmhi0,$nmhi0 + ldw 0($xfer),$abhi + add $hi1,$nmlo0,$nmlo0 + stws,ma $nmlo0,8($tp) ; tp[j-1] + addib,<> 8,$idx,L\$inner_pa11 ; j++++ + addc %r0,$nmhi0,$hi1 + + xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] + ldw 12($xfer),$nmlo1 + xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m + ldw 8($xfer),$nmhi1 + add $hi0,$ablo,$ablo + ldw 4($tp),$ti0 ; tp[j] + addc %r0,$abhi,$abhi + fstds ${fab1},0($xfer) + add $ti1,$ablo,$ablo + fstds ${fnm1},8($xfer) + addc %r0,$abhi,$hi0 + ldw -16($xfer),$abhi + add $ablo,$nmlo1,$nmlo1 + ldw -12($xfer),$ablo + addc %r0,$nmhi1,$nmhi1 + ldw -8($xfer),$nmhi0 + add $hi1,$nmlo1,$nmlo1 + ldw -4($xfer),$nmlo0 + addc %r0,$nmhi1,$hi1 + + add $hi0,$ablo,$ablo + stw $nmlo1,-4($tp) ; tp[j-1] + addc %r0,$abhi,$abhi + add $ti0,$ablo,$ablo + ldw 8($tp),$ti1 ; tp[j] + addc %r0,$abhi,$hi0 + ldw 0($xfer),$abhi + add $ablo,$nmlo0,$nmlo0 + ldw 4($xfer),$ablo + addc %r0,$nmhi0,$nmhi0 + ldws,mb 8($xfer),$nmhi1 + add $hi1,$nmlo0,$nmlo0 + ldw 4($xfer),$nmlo1 + addc %r0,$nmhi0,$hi1 + stws,ma $nmlo0,8($tp) ; tp[j-1] + + addib,= -1,$num,L\$outerdone_pa11; i-- + subi 0,$arrsz,$idx ; j=0 + + fldws,ma 4($bp),${fbi} ; bp[i] + flddx $idx($ap),${fai} ; ap[0] + add $hi0,$ablo,$ablo + addc %r0,$abhi,$abhi + flddx $idx($np),${fni} ; np[0] + fldws 8($xfer),${fti}R ; tp[0] + add $ti1,$ablo,$ablo + addc %r0,$abhi,$hi0 + + ldo 8($idx),$idx ; j++++ + xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] + xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] + ldw 4($tp),$ti0 ; tp[j] + + add $hi1,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$nmhi1 + fstws,mb ${fab0}L,-8($xfer) ; save high part + add $ablo,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$hi1 + fcpy,sgl %fr0,${fti}L ; zero high part + fcpy,sgl %fr0,${fab0}L + stw $nmlo1,-4($tp) ; tp[j-1] + + fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double + fcnvxf,dbl,dbl ${fab0},${fab0} + add $hi1,$hi0,$hi0 + addc %r0,%r0,$hi1 + fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] + add $ti0,$hi0,$hi0 + addc %r0,$hi1,$hi1 + fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int + stw $hi0,0($tp) + stw $hi1,4($tp) + xmpyu ${fn0},${fab0}R,${fm0} + + b L\$outer_pa11 + ldo `$LOCALS+32+4`($fp),$tp + +L\$outerdone_pa11 + add $hi0,$ablo,$ablo + addc %r0,$abhi,$abhi + add $ti1,$ablo,$ablo + addc %r0,$abhi,$hi0 + + ldw 4($tp),$ti0 ; tp[j] + + add $hi1,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$nmhi1 + add $ablo,$nmlo1,$nmlo1 + addc %r0,$nmhi1,$hi1 + stw $nmlo1,-4($tp) ; tp[j-1] + + add $hi1,$hi0,$hi0 + addc %r0,%r0,$hi1 + add $ti0,$hi0,$hi0 + addc %r0,$hi1,$hi1 + stw $hi0,0($tp) + stw $hi1,4($tp) + + ldo `$LOCALS+32+4`($fp),$tp + sub %r0,%r0,%r0 ; clear borrow + ldw -4($tp),$ti0 + addl $tp,$arrsz,$tp +L\$sub_pa11 + ldwx $idx($np),$hi0 + subb $ti0,$hi0,$hi1 + ldwx $idx($tp),$ti0 + addib,<> 4,$idx,L\$sub_pa11 + stws,ma $hi1,4($rp) + + subb $ti0,%r0,$hi1 + ldo -4($tp),$tp + and $tp,$hi1,$ap + andcm $rp,$hi1,$bp + or $ap,$bp,$np + + sub $rp,$arrsz,$rp ; rewind rp + subi 0,$arrsz,$idx + ldo `$LOCALS+32`($fp),$tp +L\$copy_pa11 + ldwx $idx($np),$hi0 + stws,ma %r0,4($tp) + addib,<> 4,$idx,L\$copy_pa11 + stws,ma $hi0,4($rp) + + nop ; alignment +L\$done +___ +} + +$code.=<<___; + ldi 1,%r28 ; signal "handled" + ldo $FRAME($fp),%sp ; destroy tp[num+1] + + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 +L\$abort + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by " +___ + +# Explicitly encode PA-RISC 2.0 instructions used in this module, so +# that it can be compiled with .LEVEL 1.0. It should be noted that I +# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 +# directive... + +my $ldd = sub { + my ($mod,$args) = @_; + my $orig = "ldd$mod\t$args"; + + if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 + { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 + { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; + $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset + $opcode|=(1<<5) if ($mod =~ /^,m/); + $opcode|=(1<<13) if ($mod =~ /^,mb/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $std = sub { + my ($mod,$args) = @_; + my $orig = "std$mod\t$args"; + + if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6 + { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6); + $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset + $opcode|=(1<<5) if ($mod =~ /^,m/); + $opcode|=(1<<13) if ($mod =~ /^,mb/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $extrd = sub { + my ($mod,$args) = @_; + my $orig = "extrd$mod\t$args"; + + # I only have ",u" completer, it's implicitly encoded... + if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 + { my $opcode=(0x36<<26)|($1<<21)|($4<<16); + my $len=32-$3; + $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos + $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 + { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); + my $len=32-$2; + $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len + $opcode |= (1<<13) if ($mod =~ /,\**=/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $shrpd = sub { + my ($mod,$args) = @_; + my $orig = "shrpd$mod\t$args"; + + if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 + { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; + my $cpos=63-$3; + $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $sub = sub { + my ($mod,$args) = @_; + my $orig = "sub$mod\t$args"; + + if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) { + my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3; + $opcode|=(1<<10); # e1 + $opcode|=(1<<8); # e2 + $opcode|=(1<<5); # d + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig + } + else { "\t".$orig; } +}; + +sub assemble { + my ($mnemonic,$mod,$args)=@_; + my $opcode = eval("\$$mnemonic"); + + ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + # flip word order in 64-bit mode... + s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8); + # assemble 2.0 instructions in 32-bit mode... + s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4); + + print $_,"\n"; +} +close STDOUT; diff --git a/src/lib/libcrypto/bn/asm/ppc-mont.pl b/src/lib/libcrypto/bn/asm/ppc-mont.pl index 7849eae959..f9b6992ccc 100644 --- a/src/lib/libcrypto/bn/asm/ppc-mont.pl +++ b/src/lib/libcrypto/bn/asm/ppc-mont.pl @@ -31,7 +31,6 @@ if ($flavour =~ /32/) { $BNSZ= $BITS/8; $SIZE_T=4; $RZONE= 224; - $FRAME= $SIZE_T*16; $LD= "lwz"; # load $LDU= "lwzu"; # load and update @@ -51,7 +50,6 @@ if ($flavour =~ /32/) { $BNSZ= $BITS/8; $SIZE_T=8; $RZONE= 288; - $FRAME= $SIZE_T*16; # same as above, but 64-bit mnemonics... $LD= "ld"; # load @@ -69,6 +67,9 @@ if ($flavour =~ /32/) { $POP= $LD; } else { die "nonsense $flavour"; } +$FRAME=8*$SIZE_T+$RZONE; +$LOCALS=8*$SIZE_T; + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or @@ -89,18 +90,18 @@ $aj="r10"; $nj="r11"; $tj="r12"; # non-volatile registers -$i="r14"; -$j="r15"; -$tp="r16"; -$m0="r17"; -$m1="r18"; -$lo0="r19"; -$hi0="r20"; -$lo1="r21"; -$hi1="r22"; -$alo="r23"; -$ahi="r24"; -$nlo="r25"; +$i="r20"; +$j="r21"; +$tp="r22"; +$m0="r23"; +$m1="r24"; +$lo0="r25"; +$hi0="r26"; +$lo1="r27"; +$hi1="r28"; +$alo="r29"; +$ahi="r30"; +$nlo="r31"; # $nhi="r0"; @@ -108,42 +109,48 @@ $code=<<___; .machine "any" .text -.globl .bn_mul_mont +.globl .bn_mul_mont_int .align 4 -.bn_mul_mont: +.bn_mul_mont_int: cmpwi $num,4 mr $rp,r3 ; $rp is reassigned li r3,0 bltlr - +___ +$code.=<<___ if ($BNSZ==4); + cmpwi $num,32 ; longer key performance is not better + bgelr +___ +$code.=<<___; slwi $num,$num,`log($BNSZ)/log(2)` li $tj,-4096 - addi $ovf,$num,`$FRAME+$RZONE` + addi $ovf,$num,$FRAME subf $ovf,$ovf,$sp ; $sp-$ovf and $ovf,$ovf,$tj ; minimize TLB usage subf $ovf,$sp,$ovf ; $ovf-$sp + mr $tj,$sp srwi $num,$num,`log($BNSZ)/log(2)` $STUX $sp,$sp,$ovf - $PUSH r14,`4*$SIZE_T`($sp) - $PUSH r15,`5*$SIZE_T`($sp) - $PUSH r16,`6*$SIZE_T`($sp) - $PUSH r17,`7*$SIZE_T`($sp) - $PUSH r18,`8*$SIZE_T`($sp) - $PUSH r19,`9*$SIZE_T`($sp) - $PUSH r20,`10*$SIZE_T`($sp) - $PUSH r21,`11*$SIZE_T`($sp) - $PUSH r22,`12*$SIZE_T`($sp) - $PUSH r23,`13*$SIZE_T`($sp) - $PUSH r24,`14*$SIZE_T`($sp) - $PUSH r25,`15*$SIZE_T`($sp) + $PUSH r20,`-12*$SIZE_T`($tj) + $PUSH r21,`-11*$SIZE_T`($tj) + $PUSH r22,`-10*$SIZE_T`($tj) + $PUSH r23,`-9*$SIZE_T`($tj) + $PUSH r24,`-8*$SIZE_T`($tj) + $PUSH r25,`-7*$SIZE_T`($tj) + $PUSH r26,`-6*$SIZE_T`($tj) + $PUSH r27,`-5*$SIZE_T`($tj) + $PUSH r28,`-4*$SIZE_T`($tj) + $PUSH r29,`-3*$SIZE_T`($tj) + $PUSH r30,`-2*$SIZE_T`($tj) + $PUSH r31,`-1*$SIZE_T`($tj) $LD $n0,0($n0) ; pull n0[0] value addi $num,$num,-2 ; adjust $num for counter register $LD $m0,0($bp) ; m0=bp[0] $LD $aj,0($ap) ; ap[0] - addi $tp,$sp,$FRAME + addi $tp,$sp,$LOCALS $UMULL $lo0,$aj,$m0 ; ap[0]*bp[0] $UMULH $hi0,$aj,$m0 @@ -205,8 +212,8 @@ L1st: Louter: $LDX $m0,$bp,$i ; m0=bp[i] $LD $aj,0($ap) ; ap[0] - addi $tp,$sp,$FRAME - $LD $tj,$FRAME($sp) ; tp[0] + addi $tp,$sp,$LOCALS + $LD $tj,$LOCALS($sp); tp[0] $UMULL $lo0,$aj,$m0 ; ap[0]*bp[i] $UMULH $hi0,$aj,$m0 $LD $aj,$BNSZ($ap) ; ap[1] @@ -273,7 +280,7 @@ Linner: addi $num,$num,2 ; restore $num subfc $j,$j,$j ; j=0 and "clear" XER[CA] - addi $tp,$sp,$FRAME + addi $tp,$sp,$LOCALS mtctr $num .align 4 @@ -299,23 +306,27 @@ Lcopy: ; copy or in-place refresh addi $j,$j,$BNSZ bdnz- Lcopy - $POP r14,`4*$SIZE_T`($sp) - $POP r15,`5*$SIZE_T`($sp) - $POP r16,`6*$SIZE_T`($sp) - $POP r17,`7*$SIZE_T`($sp) - $POP r18,`8*$SIZE_T`($sp) - $POP r19,`9*$SIZE_T`($sp) - $POP r20,`10*$SIZE_T`($sp) - $POP r21,`11*$SIZE_T`($sp) - $POP r22,`12*$SIZE_T`($sp) - $POP r23,`13*$SIZE_T`($sp) - $POP r24,`14*$SIZE_T`($sp) - $POP r25,`15*$SIZE_T`($sp) - $POP $sp,0($sp) + $POP $tj,0($sp) li r3,1 + $POP r20,`-12*$SIZE_T`($tj) + $POP r21,`-11*$SIZE_T`($tj) + $POP r22,`-10*$SIZE_T`($tj) + $POP r23,`-9*$SIZE_T`($tj) + $POP r24,`-8*$SIZE_T`($tj) + $POP r25,`-7*$SIZE_T`($tj) + $POP r26,`-6*$SIZE_T`($tj) + $POP r27,`-5*$SIZE_T`($tj) + $POP r28,`-4*$SIZE_T`($tj) + $POP r29,`-3*$SIZE_T`($tj) + $POP r30,`-2*$SIZE_T`($tj) + $POP r31,`-1*$SIZE_T`($tj) + mr $sp,$tj blr .long 0 -.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by " + .byte 0,12,4,0,0x80,12,6,0 + .long 0 + +.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by " ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/src/lib/libcrypto/bn/asm/ppc.pl b/src/lib/libcrypto/bn/asm/ppc.pl index f4093177e6..1249ce2299 100644 --- a/src/lib/libcrypto/bn/asm/ppc.pl +++ b/src/lib/libcrypto/bn/asm/ppc.pl @@ -389,7 +389,9 @@ $data=<+-------------------------------+ # | saved sp | # +-------------------------------+ -# | | -# +-------------------------------+ -# | 10 saved gpr, r14-r23 | -# . . -# . . -# +12*size_t +-------------------------------+ -# | 12 saved fpr, f14-f25 | # . . -# . . -# +12*8 +-------------------------------+ -# | padding to 64 byte boundary | -# . . -# +X +-------------------------------+ +# +64 +-------------------------------+ # | 16 gpr<->fpr transfer zone | # . . # . . @@ -173,6 +179,16 @@ $T3a="f24"; $T3b="f25"; # . . # . . # +-------------------------------+ +# . . +# -12*size_t +-------------------------------+ +# | 10 saved gpr, r22-r31 | +# . . +# . . +# -12*8 +-------------------------------+ +# | 12 saved fpr, f20-f31 | +# . . +# . . +# +-------------------------------+ $code=<<___; .machine "any" @@ -181,14 +197,14 @@ $code=<<___; .globl .$fname .align 5 .$fname: - cmpwi $num,4 + cmpwi $num,`3*8/$SIZE_T` mr $rp,r3 ; $rp is reassigned li r3,0 ; possible "not handled" return code bltlr- - andi. r0,$num,1 ; $num has to be even + andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even" bnelr- - slwi $num,$num,3 ; num*=8 + slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG) li $i,-4096 slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num add $tp,$tp,$num ; place for tp[num+1] @@ -196,35 +212,50 @@ $code=<<___; subf $tp,$tp,$sp ; $sp-$tp and $tp,$tp,$i ; minimize TLB usage subf $tp,$sp,$tp ; $tp-$sp + mr $i,$sp $STUX $sp,$sp,$tp ; alloca - $PUSH r14,`2*$SIZE_T`($sp) - $PUSH r15,`3*$SIZE_T`($sp) - $PUSH r16,`4*$SIZE_T`($sp) - $PUSH r17,`5*$SIZE_T`($sp) - $PUSH r18,`6*$SIZE_T`($sp) - $PUSH r19,`7*$SIZE_T`($sp) - $PUSH r20,`8*$SIZE_T`($sp) - $PUSH r21,`9*$SIZE_T`($sp) - $PUSH r22,`10*$SIZE_T`($sp) - $PUSH r23,`11*$SIZE_T`($sp) - stfd f14,`12*$SIZE_T+0`($sp) - stfd f15,`12*$SIZE_T+8`($sp) - stfd f16,`12*$SIZE_T+16`($sp) - stfd f17,`12*$SIZE_T+24`($sp) - stfd f18,`12*$SIZE_T+32`($sp) - stfd f19,`12*$SIZE_T+40`($sp) - stfd f20,`12*$SIZE_T+48`($sp) - stfd f21,`12*$SIZE_T+56`($sp) - stfd f22,`12*$SIZE_T+64`($sp) - stfd f23,`12*$SIZE_T+72`($sp) - stfd f24,`12*$SIZE_T+80`($sp) - stfd f25,`12*$SIZE_T+88`($sp) - + $PUSH r22,`-12*8-10*$SIZE_T`($i) + $PUSH r23,`-12*8-9*$SIZE_T`($i) + $PUSH r24,`-12*8-8*$SIZE_T`($i) + $PUSH r25,`-12*8-7*$SIZE_T`($i) + $PUSH r26,`-12*8-6*$SIZE_T`($i) + $PUSH r27,`-12*8-5*$SIZE_T`($i) + $PUSH r28,`-12*8-4*$SIZE_T`($i) + $PUSH r29,`-12*8-3*$SIZE_T`($i) + $PUSH r30,`-12*8-2*$SIZE_T`($i) + $PUSH r31,`-12*8-1*$SIZE_T`($i) + stfd f20,`-12*8`($i) + stfd f21,`-11*8`($i) + stfd f22,`-10*8`($i) + stfd f23,`-9*8`($i) + stfd f24,`-8*8`($i) + stfd f25,`-7*8`($i) + stfd f26,`-6*8`($i) + stfd f27,`-5*8`($i) + stfd f28,`-4*8`($i) + stfd f29,`-3*8`($i) + stfd f30,`-2*8`($i) + stfd f31,`-1*8`($i) +___ +$code.=<<___ if ($SIZE_T==8); ld $a0,0($ap) ; pull ap[0] value ld $n0,0($n0) ; pull n0[0] value ld $t3,0($bp) ; bp[0] - +___ +$code.=<<___ if ($SIZE_T==4); + mr $t1,$n0 + lwz $a0,0($ap) ; pull ap[0,1] value + lwz $t0,4($ap) + lwz $n0,0($t1) ; pull n0[0,1] value + lwz $t1,4($t1) + lwz $t3,0($bp) ; bp[0,1] + lwz $t2,4($bp) + insrdi $a0,$t0,32,0 + insrdi $n0,$t1,32,0 + insrdi $t3,$t2,32,0 +___ +$code.=<<___; addi $tp,$sp,`$FRAME+$TRANSFER+8+64` li $i,-64 add $nap_d,$tp,$num @@ -258,6 +289,8 @@ $code=<<___; std $t5,`$FRAME+40`($sp) std $t6,`$FRAME+48`($sp) std $t7,`$FRAME+56`($sp) +___ +$code.=<<___ if ($SIZE_T==8); lwz $t0,4($ap) ; load a[j] as 32-bit word pair lwz $t1,0($ap) lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair @@ -266,6 +299,18 @@ $code=<<___; lwz $t5,0($np) lwz $t6,12($np) ; load n[j+1] as 32-bit word pair lwz $t7,8($np) +___ +$code.=<<___ if ($SIZE_T==4); + lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs + lwz $t1,4($ap) + lwz $t2,8($ap) + lwz $t3,12($ap) + lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs + lwz $t5,4($np) + lwz $t6,8($np) + lwz $t7,12($np) +___ +$code.=<<___; lfd $ba,`$FRAME+0`($sp) lfd $bb,`$FRAME+8`($sp) lfd $bc,`$FRAME+16`($sp) @@ -374,6 +419,8 @@ $code=<<___; .align 5 L1st: +___ +$code.=<<___ if ($SIZE_T==8); lwz $t0,4($ap) ; load a[j] as 32-bit word pair lwz $t1,0($ap) lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair @@ -382,6 +429,18 @@ L1st: lwz $t5,0($np) lwz $t6,12($np) ; load n[j+1] as 32-bit word pair lwz $t7,8($np) +___ +$code.=<<___ if ($SIZE_T==4); + lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs + lwz $t1,4($ap) + lwz $t2,8($ap) + lwz $t3,12($ap) + lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs + lwz $t5,4($np) + lwz $t6,8($np) + lwz $t7,12($np) +___ +$code.=<<___; std $t0,`$FRAME+64`($sp) std $t1,`$FRAME+72`($sp) std $t2,`$FRAME+80`($sp) @@ -559,7 +618,17 @@ L1st: li $i,8 ; i=1 .align 5 Louter: +___ +$code.=<<___ if ($SIZE_T==8); ldx $t3,$bp,$i ; bp[i] +___ +$code.=<<___ if ($SIZE_T==4); + add $t0,$bp,$i + lwz $t3,0($t0) ; bp[i,i+1] + lwz $t0,4($t0) + insrdi $t3,$t0,32,0 +___ +$code.=<<___; ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0] mulld $t7,$a0,$t3 ; ap[0]*bp[i] @@ -761,6 +830,13 @@ Linner: stfd $T0b,`$FRAME+8`($sp) add $t7,$t7,$carry addc $t3,$t0,$t1 +___ +$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] + extrdi $t0,$t0,32,0 + extrdi $t1,$t1,32,0 + adde $t0,$t0,$t1 +___ +$code.=<<___; stfd $T1a,`$FRAME+16`($sp) stfd $T1b,`$FRAME+24`($sp) insrdi $t4,$t7,16,0 ; 64..127 bits @@ -768,6 +844,13 @@ Linner: stfd $T2a,`$FRAME+32`($sp) stfd $T2b,`$FRAME+40`($sp) adde $t5,$t4,$t2 +___ +$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] + extrdi $t4,$t4,32,0 + extrdi $t2,$t2,32,0 + adde $t4,$t4,$t2 +___ +$code.=<<___; stfd $T3a,`$FRAME+48`($sp) stfd $T3b,`$FRAME+56`($sp) addze $carry,$carry @@ -816,7 +899,21 @@ Linner: ld $t7,`$FRAME+72`($sp) addc $t3,$t0,$t1 +___ +$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] + extrdi $t0,$t0,32,0 + extrdi $t1,$t1,32,0 + adde $t0,$t0,$t1 +___ +$code.=<<___; adde $t5,$t4,$t2 +___ +$code.=<<___ if ($SIZE_T==4); # adjust XER[CA] + extrdi $t4,$t4,32,0 + extrdi $t2,$t2,32,0 + adde $t4,$t4,$t2 +___ +$code.=<<___; addze $carry,$carry std $t3,-16($tp) ; tp[j-1] @@ -835,7 +932,9 @@ Linner: subf $nap_d,$t7,$nap_d ; rewind pointer cmpw $i,$num blt- Louter +___ +$code.=<<___ if ($SIZE_T==8); subf $np,$num,$np ; rewind np addi $j,$j,1 ; restore counter subfc $i,$i,$i ; j=0 and "clear" XER[CA] @@ -883,34 +982,105 @@ Lcopy: ; copy or in-place refresh stdx $i,$t4,$i addi $i,$i,16 bdnz- Lcopy +___ +$code.=<<___ if ($SIZE_T==4); + subf $np,$num,$np ; rewind np + addi $j,$j,1 ; restore counter + subfc $i,$i,$i ; j=0 and "clear" XER[CA] + addi $tp,$sp,`$FRAME+$TRANSFER` + addi $np,$np,-4 + addi $rp,$rp,-4 + addi $ap,$sp,`$FRAME+$TRANSFER+4` + mtctr $j + +.align 4 +Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order + ldu $t2,16($tp) + lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order + lwz $t5,8($np) + lwz $t6,12($np) + lwzu $t7,16($np) + extrdi $t1,$t0,32,0 + extrdi $t3,$t2,32,0 + subfe $t4,$t4,$t0 ; tp[j]-np[j] + stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order + subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1] + stw $t1,8($ap) + subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2] + stw $t2,12($ap) + subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3] + stwu $t3,16($ap) + stw $t4,4($rp) + stw $t5,8($rp) + stw $t6,12($rp) + stwu $t7,16($rp) + bdnz- Lsub + + li $i,0 + subfe $ovf,$i,$ovf ; handle upmost overflow bit + addi $tp,$sp,`$FRAME+$TRANSFER+4` + subf $rp,$num,$rp ; rewind rp + and $ap,$tp,$ovf + andc $np,$rp,$ovf + or $ap,$ap,$np ; ap=borrow?tp:rp + addi $tp,$sp,`$FRAME+$TRANSFER` + mtctr $j + +.align 4 +Lcopy: ; copy or in-place refresh + lwz $t0,4($ap) + lwz $t1,8($ap) + lwz $t2,12($ap) + lwzu $t3,16($ap) + std $i,8($nap_d) ; zap nap_d + std $i,16($nap_d) + std $i,24($nap_d) + std $i,32($nap_d) + std $i,40($nap_d) + std $i,48($nap_d) + std $i,56($nap_d) + stdu $i,64($nap_d) + stw $t0,4($rp) + stw $t1,8($rp) + stw $t2,12($rp) + stwu $t3,16($rp) + std $i,8($tp) ; zap tp at once + stdu $i,16($tp) + bdnz- Lcopy +___ - $POP r14,`2*$SIZE_T`($sp) - $POP r15,`3*$SIZE_T`($sp) - $POP r16,`4*$SIZE_T`($sp) - $POP r17,`5*$SIZE_T`($sp) - $POP r18,`6*$SIZE_T`($sp) - $POP r19,`7*$SIZE_T`($sp) - $POP r20,`8*$SIZE_T`($sp) - $POP r21,`9*$SIZE_T`($sp) - $POP r22,`10*$SIZE_T`($sp) - $POP r23,`11*$SIZE_T`($sp) - lfd f14,`12*$SIZE_T+0`($sp) - lfd f15,`12*$SIZE_T+8`($sp) - lfd f16,`12*$SIZE_T+16`($sp) - lfd f17,`12*$SIZE_T+24`($sp) - lfd f18,`12*$SIZE_T+32`($sp) - lfd f19,`12*$SIZE_T+40`($sp) - lfd f20,`12*$SIZE_T+48`($sp) - lfd f21,`12*$SIZE_T+56`($sp) - lfd f22,`12*$SIZE_T+64`($sp) - lfd f23,`12*$SIZE_T+72`($sp) - lfd f24,`12*$SIZE_T+80`($sp) - lfd f25,`12*$SIZE_T+88`($sp) - $POP $sp,0($sp) +$code.=<<___; + $POP $i,0($sp) li r3,1 ; signal "handled" + $POP r22,`-12*8-10*$SIZE_T`($i) + $POP r23,`-12*8-9*$SIZE_T`($i) + $POP r24,`-12*8-8*$SIZE_T`($i) + $POP r25,`-12*8-7*$SIZE_T`($i) + $POP r26,`-12*8-6*$SIZE_T`($i) + $POP r27,`-12*8-5*$SIZE_T`($i) + $POP r28,`-12*8-4*$SIZE_T`($i) + $POP r29,`-12*8-3*$SIZE_T`($i) + $POP r30,`-12*8-2*$SIZE_T`($i) + $POP r31,`-12*8-1*$SIZE_T`($i) + lfd f20,`-12*8`($i) + lfd f21,`-11*8`($i) + lfd f22,`-10*8`($i) + lfd f23,`-9*8`($i) + lfd f24,`-8*8`($i) + lfd f25,`-7*8`($i) + lfd f26,`-6*8`($i) + lfd f27,`-5*8`($i) + lfd f28,`-4*8`($i) + lfd f29,`-3*8`($i) + lfd f30,`-2*8`($i) + lfd f31,`-1*8`($i) + mr $sp,$i blr .long 0 -.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by " + .byte 0,12,4,0,0x8c,10,6,0 + .long 0 + +.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by " ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/src/lib/libcrypto/bn/asm/s390x-gf2m.pl b/src/lib/libcrypto/bn/asm/s390x-gf2m.pl new file mode 100644 index 0000000000..cd9f13eca2 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/s390x-gf2m.pl @@ -0,0 +1,221 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# May 2011 +# +# The module implements bn_GF2m_mul_2x2 polynomial multiplication used +# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for +# the time being... gcc 4.3 appeared to generate poor code, therefore +# the effort. And indeed, the module delivers 55%-90%(*) improvement +# on haviest ECDSA verify and ECDH benchmarks for 163- and 571-bit +# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196. +# This is for 64-bit build. In 32-bit "highgprs" case improvement is +# even higher, for example on z990 it was measured 80%-150%. ECDSA +# sign is modest 9%-12% faster. Keep in mind that these coefficients +# are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is +# burnt in it... +# +# (*) gcc 4.1 was observed to deliver better results than gcc 4.3, +# so that improvement coefficients can vary from one specific +# setup to another. + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$stdframe=16*$SIZE_T+4*8; + +$rp="%r2"; +$a1="%r3"; +$a0="%r4"; +$b1="%r5"; +$b0="%r6"; + +$ra="%r14"; +$sp="%r15"; + +@T=("%r0","%r1"); +@i=("%r12","%r13"); + +($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(6..11)); +($lo,$hi,$b)=map("%r$_",(3..5)); $a=$lo; $mask=$a8; + +$code.=<<___; +.text + +.type _mul_1x1,\@function +.align 16 +_mul_1x1: + lgr $a1,$a + sllg $a2,$a,1 + sllg $a4,$a,2 + sllg $a8,$a,3 + + srag $lo,$a1,63 # broadcast 63rd bit + nihh $a1,0x1fff + srag @i[0],$a2,63 # broadcast 62nd bit + nihh $a2,0x3fff + srag @i[1],$a4,63 # broadcast 61st bit + nihh $a4,0x7fff + ngr $lo,$b + ngr @i[0],$b + ngr @i[1],$b + + lghi @T[0],0 + lgr $a12,$a1 + stg @T[0],`$stdframe+0*8`($sp) # tab[0]=0 + xgr $a12,$a2 + stg $a1,`$stdframe+1*8`($sp) # tab[1]=a1 + lgr $a48,$a4 + stg $a2,`$stdframe+2*8`($sp) # tab[2]=a2 + xgr $a48,$a8 + stg $a12,`$stdframe+3*8`($sp) # tab[3]=a1^a2 + xgr $a1,$a4 + + stg $a4,`$stdframe+4*8`($sp) # tab[4]=a4 + xgr $a2,$a4 + stg $a1,`$stdframe+5*8`($sp) # tab[5]=a1^a4 + xgr $a12,$a4 + stg $a2,`$stdframe+6*8`($sp) # tab[6]=a2^a4 + xgr $a1,$a48 + stg $a12,`$stdframe+7*8`($sp) # tab[7]=a1^a2^a4 + xgr $a2,$a48 + + stg $a8,`$stdframe+8*8`($sp) # tab[8]=a8 + xgr $a12,$a48 + stg $a1,`$stdframe+9*8`($sp) # tab[9]=a1^a8 + xgr $a1,$a4 + stg $a2,`$stdframe+10*8`($sp) # tab[10]=a2^a8 + xgr $a2,$a4 + stg $a12,`$stdframe+11*8`($sp) # tab[11]=a1^a2^a8 + + xgr $a12,$a4 + stg $a48,`$stdframe+12*8`($sp) # tab[12]=a4^a8 + srlg $hi,$lo,1 + stg $a1,`$stdframe+13*8`($sp) # tab[13]=a1^a4^a8 + sllg $lo,$lo,63 + stg $a2,`$stdframe+14*8`($sp) # tab[14]=a2^a4^a8 + srlg @T[0],@i[0],2 + stg $a12,`$stdframe+15*8`($sp) # tab[15]=a1^a2^a4^a8 + + lghi $mask,`0xf<<3` + sllg $a1,@i[0],62 + sllg @i[0],$b,3 + srlg @T[1],@i[1],3 + ngr @i[0],$mask + sllg $a2,@i[1],61 + srlg @i[1],$b,4-3 + xgr $hi,@T[0] + ngr @i[1],$mask + xgr $lo,$a1 + xgr $hi,@T[1] + xgr $lo,$a2 + + xg $lo,$stdframe(@i[0],$sp) + srlg @i[0],$b,8-3 + ngr @i[0],$mask +___ +for($n=1;$n<14;$n++) { +$code.=<<___; + lg @T[1],$stdframe(@i[1],$sp) + srlg @i[1],$b,`($n+2)*4`-3 + sllg @T[0],@T[1],`$n*4` + ngr @i[1],$mask + srlg @T[1],@T[1],`64-$n*4` + xgr $lo,@T[0] + xgr $hi,@T[1] +___ + push(@i,shift(@i)); push(@T,shift(@T)); +} +$code.=<<___; + lg @T[1],$stdframe(@i[1],$sp) + sllg @T[0],@T[1],`$n*4` + srlg @T[1],@T[1],`64-$n*4` + xgr $lo,@T[0] + xgr $hi,@T[1] + + lg @T[0],$stdframe(@i[0],$sp) + sllg @T[1],@T[0],`($n+1)*4` + srlg @T[0],@T[0],`64-($n+1)*4` + xgr $lo,@T[1] + xgr $hi,@T[0] + + br $ra +.size _mul_1x1,.-_mul_1x1 + +.globl bn_GF2m_mul_2x2 +.type bn_GF2m_mul_2x2,\@function +.align 16 +bn_GF2m_mul_2x2: + stm${g} %r3,%r15,3*$SIZE_T($sp) + + lghi %r1,-$stdframe-128 + la %r0,0($sp) + la $sp,0(%r1,$sp) # alloca + st${g} %r0,0($sp) # back chain +___ +if ($SIZE_T==8) { +my @r=map("%r$_",(6..9)); +$code.=<<___; + bras $ra,_mul_1x1 # a1·b1 + stmg $lo,$hi,16($rp) + + lg $a,`$stdframe+128+4*$SIZE_T`($sp) + lg $b,`$stdframe+128+6*$SIZE_T`($sp) + bras $ra,_mul_1x1 # a0·b0 + stmg $lo,$hi,0($rp) + + lg $a,`$stdframe+128+3*$SIZE_T`($sp) + lg $b,`$stdframe+128+5*$SIZE_T`($sp) + xg $a,`$stdframe+128+4*$SIZE_T`($sp) + xg $b,`$stdframe+128+6*$SIZE_T`($sp) + bras $ra,_mul_1x1 # (a0+a1)·(b0+b1) + lmg @r[0],@r[3],0($rp) + + xgr $lo,$hi + xgr $hi,@r[1] + xgr $lo,@r[0] + xgr $hi,@r[2] + xgr $lo,@r[3] + xgr $hi,@r[3] + xgr $lo,$hi + stg $hi,16($rp) + stg $lo,8($rp) +___ +} else { +$code.=<<___; + sllg %r3,%r3,32 + sllg %r5,%r5,32 + or %r3,%r4 + or %r5,%r6 + bras $ra,_mul_1x1 + rllg $lo,$lo,32 + rllg $hi,$hi,32 + stmg $lo,$hi,0($rp) +___ +} +$code.=<<___; + lm${g} %r6,%r15,`$stdframe+128+6*$SIZE_T`($sp) + br $ra +.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 +.string "GF(2^m) Multiplication for s390x, CRYPTOGAMS by " +___ + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/bn/asm/s390x-mont.pl b/src/lib/libcrypto/bn/asm/s390x-mont.pl index f61246f5b6..9fd64e81ee 100644 --- a/src/lib/libcrypto/bn/asm/s390x-mont.pl +++ b/src/lib/libcrypto/bn/asm/s390x-mont.pl @@ -32,6 +32,33 @@ # Reschedule to minimize/avoid Address Generation Interlock hazard, # make inner loops counter-based. +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG +# is achieved by swapping words after 64-bit loads, follow _dswap-s. +# On z990 it was measured to perform 2.6-2.2 times better than +# compiler-generated code, less for longer keys... + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$stdframe=16*$SIZE_T+4*8; + $mn0="%r0"; $num="%r1"; @@ -60,34 +87,44 @@ $code.=<<___; .globl bn_mul_mont .type bn_mul_mont,\@function bn_mul_mont: - lgf $num,164($sp) # pull $num - sla $num,3 # $num to enumerate bytes + lgf $num,`$stdframe+$SIZE_T-4`($sp) # pull $num + sla $num,`log($SIZE_T)/log(2)` # $num to enumerate bytes la $bp,0($num,$bp) - stg %r2,16($sp) + st${g} %r2,2*$SIZE_T($sp) cghi $num,16 # lghi %r2,0 # blr %r14 # if($num<16) return 0; +___ +$code.=<<___ if ($flavour =~ /3[12]/); + tmll $num,4 + bnzr %r14 # if ($num&1) return 0; +___ +$code.=<<___ if ($flavour !~ /3[12]/); cghi $num,96 # bhr %r14 # if($num>96) return 0; +___ +$code.=<<___; + stm${g} %r3,%r15,3*$SIZE_T($sp) - stmg %r3,%r15,24($sp) - - lghi $rp,-160-8 # leave room for carry bit + lghi $rp,-$stdframe-8 # leave room for carry bit lcgr $j,$num # -$num lgr %r0,$sp la $rp,0($rp,$sp) la $sp,0($j,$rp) # alloca - stg %r0,0($sp) # back chain + st${g} %r0,0($sp) # back chain sra $num,3 # restore $num la $bp,0($j,$bp) # restore $bp ahi $num,-1 # adjust $num for inner loop lg $n0,0($n0) # pull n0 + _dswap $n0 lg $bi,0($bp) + _dswap $bi lg $alo,0($ap) + _dswap $alo mlgr $ahi,$bi # ap[0]*bp[0] lgr $AHI,$ahi @@ -95,6 +132,7 @@ bn_mul_mont: msgr $mn0,$n0 lg $nlo,0($np) # + _dswap $nlo mlgr $nhi,$mn0 # np[0]*m1 algr $nlo,$alo # +="tp[0]" lghi $NHI,0 @@ -106,12 +144,14 @@ bn_mul_mont: .align 16 .L1st: lg $alo,0($j,$ap) + _dswap $alo mlgr $ahi,$bi # ap[j]*bp[0] algr $alo,$AHI lghi $AHI,0 alcgr $AHI,$ahi lg $nlo,0($j,$np) + _dswap $nlo mlgr $nhi,$mn0 # np[j]*m1 algr $nlo,$NHI lghi $NHI,0 @@ -119,22 +159,24 @@ bn_mul_mont: algr $nlo,$alo alcgr $NHI,$nhi - stg $nlo,160-8($j,$sp) # tp[j-1]= + stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= la $j,8($j) # j++ brct $count,.L1st algr $NHI,$AHI lghi $AHI,0 alcgr $AHI,$AHI # upmost overflow bit - stg $NHI,160-8($j,$sp) - stg $AHI,160($j,$sp) + stg $NHI,$stdframe-8($j,$sp) + stg $AHI,$stdframe($j,$sp) la $bp,8($bp) # bp++ .Louter: lg $bi,0($bp) # bp[i] + _dswap $bi lg $alo,0($ap) + _dswap $alo mlgr $ahi,$bi # ap[0]*bp[i] - alg $alo,160($sp) # +=tp[0] + alg $alo,$stdframe($sp) # +=tp[0] lghi $AHI,0 alcgr $AHI,$ahi @@ -142,6 +184,7 @@ bn_mul_mont: msgr $mn0,$n0 # tp[0]*n0 lg $nlo,0($np) # np[0] + _dswap $nlo mlgr $nhi,$mn0 # np[0]*m1 algr $nlo,$alo # +="tp[0]" lghi $NHI,0 @@ -153,14 +196,16 @@ bn_mul_mont: .align 16 .Linner: lg $alo,0($j,$ap) + _dswap $alo mlgr $ahi,$bi # ap[j]*bp[i] algr $alo,$AHI lghi $AHI,0 alcgr $ahi,$AHI - alg $alo,160($j,$sp)# +=tp[j] + alg $alo,$stdframe($j,$sp)# +=tp[j] alcgr $AHI,$ahi lg $nlo,0($j,$np) + _dswap $nlo mlgr $nhi,$mn0 # np[j]*m1 algr $nlo,$NHI lghi $NHI,0 @@ -168,31 +213,33 @@ bn_mul_mont: algr $nlo,$alo # +="tp[j]" alcgr $NHI,$nhi - stg $nlo,160-8($j,$sp) # tp[j-1]= + stg $nlo,$stdframe-8($j,$sp) # tp[j-1]= la $j,8($j) # j++ brct $count,.Linner algr $NHI,$AHI lghi $AHI,0 alcgr $AHI,$AHI - alg $NHI,160($j,$sp)# accumulate previous upmost overflow bit + alg $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit lghi $ahi,0 alcgr $AHI,$ahi # new upmost overflow bit - stg $NHI,160-8($j,$sp) - stg $AHI,160($j,$sp) + stg $NHI,$stdframe-8($j,$sp) + stg $AHI,$stdframe($j,$sp) la $bp,8($bp) # bp++ - clg $bp,160+8+32($j,$sp) # compare to &bp[num] + cl${g} $bp,`$stdframe+8+4*$SIZE_T`($j,$sp) # compare to &bp[num] jne .Louter - lg $rp,160+8+16($j,$sp) # reincarnate rp - la $ap,160($sp) + l${g} $rp,`$stdframe+8+2*$SIZE_T`($j,$sp) # reincarnate rp + la $ap,$stdframe($sp) ahi $num,1 # restore $num, incidentally clears "borrow" la $j,0(%r0) lr $count,$num .Lsub: lg $alo,0($j,$ap) - slbg $alo,0($j,$np) + lg $nlo,0($j,$np) + _dswap $nlo + slbgr $alo,$nlo stg $alo,0($j,$rp) la $j,8($j) brct $count,.Lsub @@ -207,19 +254,24 @@ bn_mul_mont: la $j,0(%r0) lgr $count,$num -.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh - stg $j,160($j,$sp) # zap tp +.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh + _dswap $alo + stg $j,$stdframe($j,$sp) # zap tp stg $alo,0($j,$rp) la $j,8($j) brct $count,.Lcopy - la %r1,160+8+48($j,$sp) - lmg %r6,%r15,0(%r1) + la %r1,`$stdframe+8+6*$SIZE_T`($j,$sp) + lm${g} %r6,%r15,0(%r1) lghi %r2,1 # signal "processed" br %r14 .size bn_mul_mont,.-bn_mul_mont .string "Montgomery Multiplication for s390x, CRYPTOGAMS by " ___ -print $code; +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e; + print $_,"\n"; +} close STDOUT; diff --git a/src/lib/libcrypto/bn/asm/x86-gf2m.pl b/src/lib/libcrypto/bn/asm/x86-gf2m.pl new file mode 100644 index 0000000000..808a1e5969 --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86-gf2m.pl @@ -0,0 +1,313 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# May 2011 +# +# The module implements bn_GF2m_mul_2x2 polynomial multiplication used +# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for +# the time being... Except that it has three code paths: pure integer +# code suitable for any x86 CPU, MMX code suitable for PIII and later +# and PCLMULQDQ suitable for Westmere and later. Improvement varies +# from one benchmark and µ-arch to another. Below are interval values +# for 163- and 571-bit ECDH benchmarks relative to compiler-generated +# code: +# +# PIII 16%-30% +# P4 12%-12% +# Opteron 18%-40% +# Core2 19%-44% +# Atom 38%-64% +# Westmere 53%-121%(PCLMULQDQ)/20%-32%(MMX) +# Sandy Bridge 72%-127%(PCLMULQDQ)/27%-23%(MMX) +# +# Note that above improvement coefficients are not coefficients for +# bn_GF2m_mul_2x2 itself. For example 120% ECDH improvement is result +# of bn_GF2m_mul_2x2 being >4x faster. As it gets faster, benchmark +# is more and more dominated by other subroutines, most notably by +# BN_GF2m_mod[_mul]_arr... + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],$0,$x86only = $ARGV[$#ARGV] eq "386"); + +$sse2=0; +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } + +&external_label("OPENSSL_ia32cap_P") if ($sse2); + +$a="eax"; +$b="ebx"; +($a1,$a2,$a4)=("ecx","edx","ebp"); + +$R="mm0"; +@T=("mm1","mm2"); +($A,$B,$B30,$B31)=("mm2","mm3","mm4","mm5"); +@i=("esi","edi"); + + if (!$x86only) { +&function_begin_B("_mul_1x1_mmx"); + &sub ("esp",32+4); + &mov ($a1,$a); + &lea ($a2,&DWP(0,$a,$a)); + &and ($a1,0x3fffffff); + &lea ($a4,&DWP(0,$a2,$a2)); + &mov (&DWP(0*4,"esp"),0); + &and ($a2,0x7fffffff); + &movd ($A,$a); + &movd ($B,$b); + &mov (&DWP(1*4,"esp"),$a1); # a1 + &xor ($a1,$a2); # a1^a2 + &pxor ($B31,$B31); + &pxor ($B30,$B30); + &mov (&DWP(2*4,"esp"),$a2); # a2 + &xor ($a2,$a4); # a2^a4 + &mov (&DWP(3*4,"esp"),$a1); # a1^a2 + &pcmpgtd($B31,$A); # broadcast 31st bit + &paddd ($A,$A); # $A<<=1 + &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 + &mov (&DWP(4*4,"esp"),$a4); # a4 + &xor ($a4,$a2); # a2=a4^a2^a4 + &pand ($B31,$B); + &pcmpgtd($B30,$A); # broadcast 30th bit + &mov (&DWP(5*4,"esp"),$a1); # a1^a4 + &xor ($a4,$a1); # a1^a2^a4 + &psllq ($B31,31); + &pand ($B30,$B); + &mov (&DWP(6*4,"esp"),$a2); # a2^a4 + &mov (@i[0],0x7); + &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 + &mov ($a4,@i[0]); + &and (@i[0],$b); + &shr ($b,3); + &mov (@i[1],$a4); + &psllq ($B30,30); + &and (@i[1],$b); + &shr ($b,3); + &movd ($R,&DWP(0,"esp",@i[0],4)); + &mov (@i[0],$a4); + &and (@i[0],$b); + &shr ($b,3); + for($n=1;$n<9;$n++) { + &movd (@T[1],&DWP(0,"esp",@i[1],4)); + &mov (@i[1],$a4); + &psllq (@T[1],3*$n); + &and (@i[1],$b); + &shr ($b,3); + &pxor ($R,@T[1]); + + push(@i,shift(@i)); push(@T,shift(@T)); + } + &movd (@T[1],&DWP(0,"esp",@i[1],4)); + &pxor ($R,$B30); + &psllq (@T[1],3*$n++); + &pxor ($R,@T[1]); + + &movd (@T[0],&DWP(0,"esp",@i[0],4)); + &pxor ($R,$B31); + &psllq (@T[0],3*$n); + &add ("esp",32+4); + &pxor ($R,@T[0]); + &ret (); +&function_end_B("_mul_1x1_mmx"); + } + +($lo,$hi)=("eax","edx"); +@T=("ecx","ebp"); + +&function_begin_B("_mul_1x1_ialu"); + &sub ("esp",32+4); + &mov ($a1,$a); + &lea ($a2,&DWP(0,$a,$a)); + &lea ($a4,&DWP(0,"",$a,4)); + &and ($a1,0x3fffffff); + &lea (@i[1],&DWP(0,$lo,$lo)); + &sar ($lo,31); # broadcast 31st bit + &mov (&DWP(0*4,"esp"),0); + &and ($a2,0x7fffffff); + &mov (&DWP(1*4,"esp"),$a1); # a1 + &xor ($a1,$a2); # a1^a2 + &mov (&DWP(2*4,"esp"),$a2); # a2 + &xor ($a2,$a4); # a2^a4 + &mov (&DWP(3*4,"esp"),$a1); # a1^a2 + &xor ($a1,$a2); # a1^a4=a1^a2^a2^a4 + &mov (&DWP(4*4,"esp"),$a4); # a4 + &xor ($a4,$a2); # a2=a4^a2^a4 + &mov (&DWP(5*4,"esp"),$a1); # a1^a4 + &xor ($a4,$a1); # a1^a2^a4 + &sar (@i[1],31); # broardcast 30th bit + &and ($lo,$b); + &mov (&DWP(6*4,"esp"),$a2); # a2^a4 + &and (@i[1],$b); + &mov (&DWP(7*4,"esp"),$a4); # a1^a2^a4 + &mov ($hi,$lo); + &shl ($lo,31); + &mov (@T[0],@i[1]); + &shr ($hi,1); + + &mov (@i[0],0x7); + &shl (@i[1],30); + &and (@i[0],$b); + &shr (@T[0],2); + &xor ($lo,@i[1]); + + &shr ($b,3); + &mov (@i[1],0x7); # 5-byte instruction!? + &and (@i[1],$b); + &shr ($b,3); + &xor ($hi,@T[0]); + &xor ($lo,&DWP(0,"esp",@i[0],4)); + &mov (@i[0],0x7); + &and (@i[0],$b); + &shr ($b,3); + for($n=1;$n<9;$n++) { + &mov (@T[1],&DWP(0,"esp",@i[1],4)); + &mov (@i[1],0x7); + &mov (@T[0],@T[1]); + &shl (@T[1],3*$n); + &and (@i[1],$b); + &shr (@T[0],32-3*$n); + &xor ($lo,@T[1]); + &shr ($b,3); + &xor ($hi,@T[0]); + + push(@i,shift(@i)); push(@T,shift(@T)); + } + &mov (@T[1],&DWP(0,"esp",@i[1],4)); + &mov (@T[0],@T[1]); + &shl (@T[1],3*$n); + &mov (@i[1],&DWP(0,"esp",@i[0],4)); + &shr (@T[0],32-3*$n); $n++; + &mov (@i[0],@i[1]); + &xor ($lo,@T[1]); + &shl (@i[1],3*$n); + &xor ($hi,@T[0]); + &shr (@i[0],32-3*$n); + &xor ($lo,@i[1]); + &xor ($hi,@i[0]); + + &add ("esp",32+4); + &ret (); +&function_end_B("_mul_1x1_ialu"); + +# void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0); +&function_begin_B("bn_GF2m_mul_2x2"); +if (!$x86only) { + &picmeup("edx","OPENSSL_ia32cap_P"); + &mov ("eax",&DWP(0,"edx")); + &mov ("edx",&DWP(4,"edx")); + &test ("eax",1<<23); # check MMX bit + &jz (&label("ialu")); +if ($sse2) { + &test ("eax",1<<24); # check FXSR bit + &jz (&label("mmx")); + &test ("edx",1<<1); # check PCLMULQDQ bit + &jz (&label("mmx")); + + &movups ("xmm0",&QWP(8,"esp")); + &shufps ("xmm0","xmm0",0b10110001); + &pclmulqdq ("xmm0","xmm0",1); + &mov ("eax",&DWP(4,"esp")); + &movups (&QWP(0,"eax"),"xmm0"); + &ret (); + +&set_label("mmx",16); +} + &push ("ebp"); + &push ("ebx"); + &push ("esi"); + &push ("edi"); + &mov ($a,&wparam(1)); + &mov ($b,&wparam(3)); + &call ("_mul_1x1_mmx"); # a1·b1 + &movq ("mm7",$R); + + &mov ($a,&wparam(2)); + &mov ($b,&wparam(4)); + &call ("_mul_1x1_mmx"); # a0·b0 + &movq ("mm6",$R); + + &mov ($a,&wparam(1)); + &mov ($b,&wparam(3)); + &xor ($a,&wparam(2)); + &xor ($b,&wparam(4)); + &call ("_mul_1x1_mmx"); # (a0+a1)·(b0+b1) + &pxor ($R,"mm7"); + &mov ($a,&wparam(0)); + &pxor ($R,"mm6"); # (a0+a1)·(b0+b1)-a1·b1-a0·b0 + + &movq ($A,$R); + &psllq ($R,32); + &pop ("edi"); + &psrlq ($A,32); + &pop ("esi"); + &pxor ($R,"mm6"); + &pop ("ebx"); + &pxor ($A,"mm7"); + &movq (&QWP(0,$a),$R); + &pop ("ebp"); + &movq (&QWP(8,$a),$A); + &emms (); + &ret (); +&set_label("ialu",16); +} + &push ("ebp"); + &push ("ebx"); + &push ("esi"); + &push ("edi"); + &stack_push(4+1); + + &mov ($a,&wparam(1)); + &mov ($b,&wparam(3)); + &call ("_mul_1x1_ialu"); # a1·b1 + &mov (&DWP(8,"esp"),$lo); + &mov (&DWP(12,"esp"),$hi); + + &mov ($a,&wparam(2)); + &mov ($b,&wparam(4)); + &call ("_mul_1x1_ialu"); # a0·b0 + &mov (&DWP(0,"esp"),$lo); + &mov (&DWP(4,"esp"),$hi); + + &mov ($a,&wparam(1)); + &mov ($b,&wparam(3)); + &xor ($a,&wparam(2)); + &xor ($b,&wparam(4)); + &call ("_mul_1x1_ialu"); # (a0+a1)·(b0+b1) + + &mov ("ebp",&wparam(0)); + @r=("ebx","ecx","edi","esi"); + &mov (@r[0],&DWP(0,"esp")); + &mov (@r[1],&DWP(4,"esp")); + &mov (@r[2],&DWP(8,"esp")); + &mov (@r[3],&DWP(12,"esp")); + + &xor ($lo,$hi); + &xor ($hi,@r[1]); + &xor ($lo,@r[0]); + &mov (&DWP(0,"ebp"),@r[0]); + &xor ($hi,@r[2]); + &mov (&DWP(12,"ebp"),@r[3]); + &xor ($lo,@r[3]); + &stack_pop(4+1); + &xor ($hi,@r[3]); + &pop ("edi"); + &xor ($lo,$hi); + &pop ("esi"); + &mov (&DWP(8,"ebp"),$hi); + &pop ("ebx"); + &mov (&DWP(4,"ebp"),$lo); + &pop ("ebp"); + &ret (); +&function_end_B("bn_GF2m_mul_2x2"); + +&asciz ("GF(2^m) Multiplication for x86, CRYPTOGAMS by "); + +&asm_finish(); diff --git a/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl b/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl new file mode 100644 index 0000000000..1658acbbdd --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86_64-gf2m.pl @@ -0,0 +1,389 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# May 2011 +# +# The module implements bn_GF2m_mul_2x2 polynomial multiplication used +# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for +# the time being... Except that it has two code paths: code suitable +# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and +# later. Improvement varies from one benchmark and µ-arch to another. +# Vanilla code path is at most 20% faster than compiler-generated code +# [not very impressive], while PCLMULQDQ - whole 85%-160% better on +# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that +# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not +# all CPU time is burnt in it... + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +($lo,$hi)=("%rax","%rdx"); $a=$lo; +($i0,$i1)=("%rsi","%rdi"); +($t0,$t1)=("%rbx","%rcx"); +($b,$mask)=("%rbp","%r8"); +($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15)); +($R,$Tx)=("%xmm0","%xmm1"); + +$code.=<<___; +.text + +.type _mul_1x1,\@abi-omnipotent +.align 16 +_mul_1x1: + sub \$128+8,%rsp + mov \$-1,$a1 + lea ($a,$a),$i0 + shr \$3,$a1 + lea (,$a,4),$i1 + and $a,$a1 # a1=a&0x1fffffffffffffff + lea (,$a,8),$a8 + sar \$63,$a # broadcast 63rd bit + lea ($a1,$a1),$a2 + sar \$63,$i0 # broadcast 62nd bit + lea (,$a1,4),$a4 + and $b,$a + sar \$63,$i1 # boardcast 61st bit + mov $a,$hi # $a is $lo + shl \$63,$lo + and $b,$i0 + shr \$1,$hi + mov $i0,$t1 + shl \$62,$i0 + and $b,$i1 + shr \$2,$t1 + xor $i0,$lo + mov $i1,$t0 + shl \$61,$i1 + xor $t1,$hi + shr \$3,$t0 + xor $i1,$lo + xor $t0,$hi + + mov $a1,$a12 + movq \$0,0(%rsp) # tab[0]=0 + xor $a2,$a12 # a1^a2 + mov $a1,8(%rsp) # tab[1]=a1 + mov $a4,$a48 + mov $a2,16(%rsp) # tab[2]=a2 + xor $a8,$a48 # a4^a8 + mov $a12,24(%rsp) # tab[3]=a1^a2 + + xor $a4,$a1 + mov $a4,32(%rsp) # tab[4]=a4 + xor $a4,$a2 + mov $a1,40(%rsp) # tab[5]=a1^a4 + xor $a4,$a12 + mov $a2,48(%rsp) # tab[6]=a2^a4 + xor $a48,$a1 # a1^a4^a4^a8=a1^a8 + mov $a12,56(%rsp) # tab[7]=a1^a2^a4 + xor $a48,$a2 # a2^a4^a4^a8=a1^a8 + + mov $a8,64(%rsp) # tab[8]=a8 + xor $a48,$a12 # a1^a2^a4^a4^a8=a1^a2^a8 + mov $a1,72(%rsp) # tab[9]=a1^a8 + xor $a4,$a1 # a1^a8^a4 + mov $a2,80(%rsp) # tab[10]=a2^a8 + xor $a4,$a2 # a2^a8^a4 + mov $a12,88(%rsp) # tab[11]=a1^a2^a8 + + xor $a4,$a12 # a1^a2^a8^a4 + mov $a48,96(%rsp) # tab[12]=a4^a8 + mov $mask,$i0 + mov $a1,104(%rsp) # tab[13]=a1^a4^a8 + and $b,$i0 + mov $a2,112(%rsp) # tab[14]=a2^a4^a8 + shr \$4,$b + mov $a12,120(%rsp) # tab[15]=a1^a2^a4^a8 + mov $mask,$i1 + and $b,$i1 + shr \$4,$b + + movq (%rsp,$i0,8),$R # half of calculations is done in SSE2 + mov $mask,$i0 + and $b,$i0 + shr \$4,$b +___ + for ($n=1;$n<8;$n++) { + $code.=<<___; + mov (%rsp,$i1,8),$t1 + mov $mask,$i1 + mov $t1,$t0 + shl \$`8*$n-4`,$t1 + and $b,$i1 + movq (%rsp,$i0,8),$Tx + shr \$`64-(8*$n-4)`,$t0 + xor $t1,$lo + pslldq \$$n,$Tx + mov $mask,$i0 + shr \$4,$b + xor $t0,$hi + and $b,$i0 + shr \$4,$b + pxor $Tx,$R +___ + } +$code.=<<___; + mov (%rsp,$i1,8),$t1 + mov $t1,$t0 + shl \$`8*$n-4`,$t1 + movq $R,$i0 + shr \$`64-(8*$n-4)`,$t0 + xor $t1,$lo + psrldq \$8,$R + xor $t0,$hi + movq $R,$i1 + xor $i0,$lo + xor $i1,$hi + + add \$128+8,%rsp + ret +.Lend_mul_1x1: +.size _mul_1x1,.-_mul_1x1 +___ + +($rp,$a1,$a0,$b1,$b0) = $win64? ("%rcx","%rdx","%r8", "%r9","%r10") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx","%r8"); # Unix order + +$code.=<<___; +.extern OPENSSL_ia32cap_P +.globl bn_GF2m_mul_2x2 +.type bn_GF2m_mul_2x2,\@abi-omnipotent +.align 16 +bn_GF2m_mul_2x2: + mov OPENSSL_ia32cap_P(%rip),%rax + bt \$33,%rax + jnc .Lvanilla_mul_2x2 + + movq $a1,%xmm0 + movq $b1,%xmm1 + movq $a0,%xmm2 +___ +$code.=<<___ if ($win64); + movq 40(%rsp),%xmm3 +___ +$code.=<<___ if (!$win64); + movq $b0,%xmm3 +___ +$code.=<<___; + movdqa %xmm0,%xmm4 + movdqa %xmm1,%xmm5 + pclmulqdq \$0,%xmm1,%xmm0 # a1·b1 + pxor %xmm2,%xmm4 + pxor %xmm3,%xmm5 + pclmulqdq \$0,%xmm3,%xmm2 # a0·b0 + pclmulqdq \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1) + xorps %xmm0,%xmm4 + xorps %xmm2,%xmm4 # (a0+a1)·(b0+b1)-a0·b0-a1·b1 + movdqa %xmm4,%xmm5 + pslldq \$8,%xmm4 + psrldq \$8,%xmm5 + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm0 + movdqu %xmm2,0($rp) + movdqu %xmm0,16($rp) + ret + +.align 16 +.Lvanilla_mul_2x2: + lea -8*17(%rsp),%rsp +___ +$code.=<<___ if ($win64); + mov `8*17+40`(%rsp),$b0 + mov %rdi,8*15(%rsp) + mov %rsi,8*16(%rsp) +___ +$code.=<<___; + mov %r14,8*10(%rsp) + mov %r13,8*11(%rsp) + mov %r12,8*12(%rsp) + mov %rbp,8*13(%rsp) + mov %rbx,8*14(%rsp) +.Lbody_mul_2x2: + mov $rp,32(%rsp) # save the arguments + mov $a1,40(%rsp) + mov $a0,48(%rsp) + mov $b1,56(%rsp) + mov $b0,64(%rsp) + + mov \$0xf,$mask + mov $a1,$a + mov $b1,$b + call _mul_1x1 # a1·b1 + mov $lo,16(%rsp) + mov $hi,24(%rsp) + + mov 48(%rsp),$a + mov 64(%rsp),$b + call _mul_1x1 # a0·b0 + mov $lo,0(%rsp) + mov $hi,8(%rsp) + + mov 40(%rsp),$a + mov 56(%rsp),$b + xor 48(%rsp),$a + xor 64(%rsp),$b + call _mul_1x1 # (a0+a1)·(b0+b1) +___ + @r=("%rbx","%rcx","%rdi","%rsi"); +$code.=<<___; + mov 0(%rsp),@r[0] + mov 8(%rsp),@r[1] + mov 16(%rsp),@r[2] + mov 24(%rsp),@r[3] + mov 32(%rsp),%rbp + + xor $hi,$lo + xor @r[1],$hi + xor @r[0],$lo + mov @r[0],0(%rbp) + xor @r[2],$hi + mov @r[3],24(%rbp) + xor @r[3],$lo + xor @r[3],$hi + xor $hi,$lo + mov $hi,16(%rbp) + mov $lo,8(%rbp) + + mov 8*10(%rsp),%r14 + mov 8*11(%rsp),%r13 + mov 8*12(%rsp),%r12 + mov 8*13(%rsp),%rbp + mov 8*14(%rsp),%rbx +___ +$code.=<<___ if ($win64); + mov 8*15(%rsp),%rdi + mov 8*16(%rsp),%rsi +___ +$code.=<<___; + lea 8*17(%rsp),%rsp + ret +.Lend_mul_2x2: +.size bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2 +.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by " +.align 16 +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind + +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 152($context),%rax # pull context->Rsp + mov 248($context),%rbx # pull context->Rip + + lea .Lbody_mul_2x2(%rip),%r10 + cmp %r10,%rbx # context->Rip<"prologue" label + jb .Lin_prologue + + mov 8*10(%rax),%r14 # mimic epilogue + mov 8*11(%rax),%r13 + mov 8*12(%rax),%r12 + mov 8*13(%rax),%rbp + mov 8*14(%rax),%rbx + mov 8*15(%rax),%rdi + mov 8*16(%rax),%rsi + + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + +.Lin_prologue: + lea 8*17(%rax),%rax + mov %rax,152($context) # restore context->Rsp + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva _mul_1x1 + .rva .Lend_mul_1x1 + .rva .LSEH_info_1x1 + + .rva .Lvanilla_mul_2x2 + .rva .Lend_mul_2x2 + .rva .LSEH_info_2x2 +.section .xdata +.align 8 +.LSEH_info_1x1: + .byte 0x01,0x07,0x02,0x00 + .byte 0x07,0x01,0x11,0x00 # sub rsp,128+8 +.LSEH_info_2x2: + .byte 9,0,0,0 + .rva se_handler +___ +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont.pl b/src/lib/libcrypto/bn/asm/x86_64-mont.pl index 3b7a6f243f..5d79b35e1c 100755 --- a/src/lib/libcrypto/bn/asm/x86_64-mont.pl +++ b/src/lib/libcrypto/bn/asm/x86_64-mont.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # ==================================================================== -# Written by Andy Polyakov for the OpenSSL +# Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -15,6 +15,20 @@ # respectful 50%. It remains to be seen if loop unrolling and # dedicated squaring routine can provide further improvement... +# July 2011. +# +# Add dedicated squaring procedure. Performance improvement varies +# from platform to platform, but in average it's ~5%/15%/25%/33% +# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. + +# August 2011. +# +# Unroll and modulo-schedule inner loops in such manner that they +# are "fallen through" for input lengths of 8, which is critical for +# 1024-bit RSA *sign*. Average performance improvement in comparison +# to *initial* version of this module from 2005 is ~0%/30%/40%/45% +# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. + $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } @@ -37,7 +51,6 @@ $n0="%r8"; # const BN_ULONG *n0, $num="%r9"; # int num); $lo0="%r10"; $hi0="%r11"; -$bp="%r12"; # reassign $bp $hi1="%r13"; $i="%r14"; $j="%r15"; @@ -51,6 +64,16 @@ $code=<<___; .type bn_mul_mont,\@function,6 .align 16 bn_mul_mont: + test \$3,${num}d + jnz .Lmul_enter + cmp \$8,${num}d + jb .Lmul_enter + cmp $ap,$bp + jne .Lmul4x_enter + jmp .Lsqr4x_enter + +.align 16 +.Lmul_enter: push %rbx push %rbp push %r12 @@ -66,48 +89,66 @@ bn_mul_mont: and \$-1024,%rsp # minimize TLB usage mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp -.Lprologue: - mov %rdx,$bp # $bp reassigned, remember? - +.Lmul_body: + mov $bp,%r12 # reassign $bp +___ + $bp="%r12"; +$code.=<<___; mov ($n0),$n0 # pull n0[0] value + mov ($bp),$m0 # m0=bp[0] + mov ($ap),%rax xor $i,$i # i=0 xor $j,$j # j=0 - mov ($bp),$m0 # m0=bp[0] - mov ($ap),%rax + mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$lo0 - mov %rdx,$hi0 + mov ($np),%rax - imulq $n0,%rax # "tp[0]"*n0 - mov %rax,$m1 + imulq $lo0,$m1 # "tp[0]"*n0 + mov %rdx,$hi0 - mulq ($np) # np[0]*m1 - add $lo0,%rax # discarded + mulq $m1 # np[0]*m1 + add %rax,$lo0 # discarded + mov 8($ap),%rax adc \$0,%rdx mov %rdx,$hi1 lea 1($j),$j # j++ + jmp .L1st_enter + +.align 16 .L1st: + add %rax,$hi1 mov ($ap,$j,8),%rax - mulq $m0 # ap[j]*bp[0] - add $hi0,%rax adc \$0,%rdx - mov %rax,$lo0 + add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] + mov $lo0,$hi0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + +.L1st_enter: + mulq $m0 # ap[j]*bp[0] + add %rax,$hi0 mov ($np,$j,8),%rax - mov %rdx,$hi0 + adc \$0,%rdx + lea 1($j),$j # j++ + mov %rdx,$lo0 mulq $m1 # np[j]*m1 - add $hi1,%rax - lea 1($j),$j # j++ + cmp $num,$j + jne .L1st + + add %rax,$hi1 + mov ($ap),%rax # ap[0] adc \$0,%rdx - add $lo0,%rax # np[j]*m1+ap[j]*bp[0] + add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx - mov %rax,-16(%rsp,$j,8) # tp[j-1] - cmp $num,$j + mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 - jl .L1st + mov $lo0,$hi0 xor %rdx,%rdx add $hi0,$hi1 @@ -116,50 +157,64 @@ bn_mul_mont: mov %rdx,(%rsp,$num,8) # store upmost overflow bit lea 1($i),$i # i++ -.align 4 + jmp .Louter +.align 16 .Louter: - xor $j,$j # j=0 - mov ($bp,$i,8),$m0 # m0=bp[i] - mov ($ap),%rax # ap[0] + xor $j,$j # j=0 + mov $n0,$m1 + mov (%rsp),$lo0 mulq $m0 # ap[0]*bp[i] - add (%rsp),%rax # ap[0]*bp[i]+tp[0] + add %rax,$lo0 # ap[0]*bp[i]+tp[0] + mov ($np),%rax adc \$0,%rdx - mov %rax,$lo0 - mov %rdx,$hi0 - imulq $n0,%rax # tp[0]*n0 - mov %rax,$m1 + imulq $lo0,$m1 # tp[0]*n0 + mov %rdx,$hi0 - mulq ($np,$j,8) # np[0]*m1 - add $lo0,%rax # discarded - mov 8(%rsp),$lo0 # tp[1] + mulq $m1 # np[0]*m1 + add %rax,$lo0 # discarded + mov 8($ap),%rax adc \$0,%rdx + mov 8(%rsp),$lo0 # tp[1] mov %rdx,$hi1 lea 1($j),$j # j++ -.align 4 + jmp .Linner_enter + +.align 16 .Linner: + add %rax,$hi1 mov ($ap,$j,8),%rax - mulq $m0 # ap[j]*bp[i] - add $hi0,%rax adc \$0,%rdx - add %rax,$lo0 # ap[j]*bp[i]+tp[j] + add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] + mov (%rsp,$j,8),$lo0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + +.Linner_enter: + mulq $m0 # ap[j]*bp[i] + add %rax,$hi0 mov ($np,$j,8),%rax adc \$0,%rdx + add $hi0,$lo0 # ap[j]*bp[i]+tp[j] mov %rdx,$hi0 + adc \$0,$hi0 + lea 1($j),$j # j++ mulq $m1 # np[j]*m1 - add $hi1,%rax - lea 1($j),$j # j++ - adc \$0,%rdx - add $lo0,%rax # np[j]*m1+ap[j]*bp[i]+tp[j] + cmp $num,$j + jne .Linner + + add %rax,$hi1 + mov ($ap),%rax # ap[0] adc \$0,%rdx + add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] mov (%rsp,$j,8),$lo0 - cmp $num,$j - mov %rax,-16(%rsp,$j,8) # tp[j-1] + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 - jl .Linner xor %rdx,%rdx add $hi0,$hi1 @@ -173,35 +228,449 @@ bn_mul_mont: cmp $num,$i jl .Louter - lea (%rsp),$ap # borrow ap for tp - lea -1($num),$j # j=num-1 - - mov ($ap),%rax # tp[0] xor $i,$i # i=0 and clear CF! + mov (%rsp),%rax # tp[0] + lea (%rsp),$ap # borrow ap for tp + mov $num,$j # j=num jmp .Lsub .align 16 .Lsub: sbb ($np,$i,8),%rax mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] - dec $j # doesn't affect CF! mov 8($ap,$i,8),%rax # tp[i+1] lea 1($i),$i # i++ - jge .Lsub + dec $j # doesnn't affect CF! + jnz .Lsub sbb \$0,%rax # handle upmost overflow bit + xor $i,$i and %rax,$ap not %rax mov $rp,$np and %rax,$np - lea -1($num),$j + mov $num,$j # j=num or $np,$ap # ap=borrow?tp:rp .align 16 .Lcopy: # copy or in-place refresh + mov ($ap,$i,8),%rax + mov $i,(%rsp,$i,8) # zap temporary vector + mov %rax,($rp,$i,8) # rp[i]=tp[i] + lea 1($i),$i + sub \$1,$j + jnz .Lcopy + + mov 8(%rsp,$num,8),%rsi # restore %rsp + mov \$1,%rax + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lmul_epilogue: + ret +.size bn_mul_mont,.-bn_mul_mont +___ +{{{ +my @A=("%r10","%r11"); +my @N=("%r13","%rdi"); +$code.=<<___; +.type bn_mul4x_mont,\@function,6 +.align 16 +bn_mul4x_mont: +.Lmul4x_enter: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + mov ${num}d,${num}d + lea 4($num),%r10 + mov %rsp,%r11 + neg %r10 + lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4)) + and \$-1024,%rsp # minimize TLB usage + + mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lmul4x_body: + mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp + mov %rdx,%r12 # reassign $bp +___ + $bp="%r12"; +$code.=<<___; + mov ($n0),$n0 # pull n0[0] value + mov ($bp),$m0 # m0=bp[0] + mov ($ap),%rax + + xor $i,$i # i=0 + xor $j,$j # j=0 + + mov $n0,$m1 + mulq $m0 # ap[0]*bp[0] + mov %rax,$A[0] + mov ($np),%rax + + imulq $A[0],$m1 # "tp[0]"*n0 + mov %rdx,$A[1] + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 + add %rax,$A[1] + mov 8($np),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 + add %rax,$N[1] + mov 16($ap),%rax + adc \$0,%rdx + add $A[1],$N[1] + lea 4($j),$j # j++ + adc \$0,%rdx + mov $N[1],(%rsp) + mov %rdx,$N[0] + jmp .L1st4x +.align 16 +.L1st4x: + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] mov ($ap,$j,8),%rax - mov %rax,($rp,$j,8) # rp[i]=tp[i] - mov $i,(%rsp,$j,8) # zap temporary vector + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov ($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-8(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov 8($np,$j,8),%rax + adc \$0,%rdx + lea 4($j),$j # j++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov -16($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + cmp $num,$j + jl .L1st4x + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + mov $N[0],-8(%rsp,$j,8) + mov $N[1],(%rsp,$j,8) # store upmost overflow bit + + lea 1($i),$i # i++ +.align 4 +.Louter4x: + mov ($bp,$i,8),$m0 # m0=bp[i] + xor $j,$j # j=0 + mov (%rsp),$A[0] + mov $n0,$m1 + mulq $m0 # ap[0]*bp[i] + add %rax,$A[0] # ap[0]*bp[i]+tp[0] + mov ($np),%rax + adc \$0,%rdx + + imulq $A[0],$m1 # tp[0]*n0 + mov %rdx,$A[1] + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # "$N[0]", discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8($np),%rax + adc \$0,%rdx + add 8(%rsp),$A[1] # +tp[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov 16($ap),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] + lea 4($j),$j # j+=2 + adc \$0,%rdx + mov $N[1],(%rsp) # tp[j-1] + mov %rdx,$N[0] + jmp .Linner4x +.align 16 +.Linner4x: + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + add -8(%rsp,$j,8),$A[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov ($np,$j,8),%rax + adc \$0,%rdx + add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[0],-8(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8($np,$j,8),%rax + adc \$0,%rdx + add 8(%rsp,$j,8),$A[1] + adc \$0,%rdx + lea 4($j),$j # j++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov -16($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + cmp $num,$j + jl .Linner4x + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + add -8(%rsp,$j,8),$A[1] + adc \$0,%rdx + lea 1($i),$i # i++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + add (%rsp,$num,8),$N[0] # pull upmost overflow bit + adc \$0,$N[1] + mov $N[0],-8(%rsp,$j,8) + mov $N[1],(%rsp,$j,8) # store upmost overflow bit + + cmp $num,$i + jl .Louter4x +___ +{ +my @ri=("%rax","%rdx",$m0,$m1); +$code.=<<___; + mov 16(%rsp,$num,8),$rp # restore $rp + mov 0(%rsp),@ri[0] # tp[0] + pxor %xmm0,%xmm0 + mov 8(%rsp),@ri[1] # tp[1] + shr \$2,$num # num/=4 + lea (%rsp),$ap # borrow ap for tp + xor $i,$i # i=0 and clear CF! + + sub 0($np),@ri[0] + mov 16($ap),@ri[2] # tp[2] + mov 24($ap),@ri[3] # tp[3] + sbb 8($np),@ri[1] + lea -1($num),$j # j=num/4-1 + jmp .Lsub4x +.align 16 +.Lsub4x: + mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 16($np,$i,8),@ri[2] + mov 32($ap,$i,8),@ri[0] # tp[i+1] + mov 40($ap,$i,8),@ri[1] + sbb 24($np,$i,8),@ri[3] + mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 32($np,$i,8),@ri[0] + mov 48($ap,$i,8),@ri[2] + mov 56($ap,$i,8),@ri[3] + sbb 40($np,$i,8),@ri[1] + lea 4($i),$i # i++ + dec $j # doesnn't affect CF! + jnz .Lsub4x + + mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] + mov 32($ap,$i,8),@ri[0] # load overflow bit + sbb 16($np,$i,8),@ri[2] + mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 24($np,$i,8),@ri[3] + mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] + + sbb \$0,@ri[0] # handle upmost overflow bit + mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] + xor $i,$i # i=0 + and @ri[0],$ap + not @ri[0] + mov $rp,$np + and @ri[0],$np + lea -1($num),$j + or $np,$ap # ap=borrow?tp:rp + + movdqu ($ap),%xmm1 + movdqa %xmm0,(%rsp) + movdqu %xmm1,($rp) + jmp .Lcopy4x +.align 16 +.Lcopy4x: # copy or in-place refresh + movdqu 16($ap,$i),%xmm2 + movdqu 32($ap,$i),%xmm1 + movdqa %xmm0,16(%rsp,$i) + movdqu %xmm2,16($rp,$i) + movdqa %xmm0,32(%rsp,$i) + movdqu %xmm1,32($rp,$i) + lea 32($i),$i dec $j - jge .Lcopy + jnz .Lcopy4x + shl \$2,$num + movdqu 16($ap,$i),%xmm2 + movdqa %xmm0,16(%rsp,$i) + movdqu %xmm2,16($rp,$i) +___ +} +$code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp mov \$1,%rax mov (%rsi),%r15 @@ -211,9 +680,823 @@ bn_mul_mont: mov 32(%rsi),%rbp mov 40(%rsi),%rbx lea 48(%rsi),%rsp -.Lepilogue: +.Lmul4x_epilogue: ret -.size bn_mul_mont,.-bn_mul_mont +.size bn_mul4x_mont,.-bn_mul4x_mont +___ +}}} + {{{ +###################################################################### +# void bn_sqr4x_mont( +my $rptr="%rdi"; # const BN_ULONG *rptr, +my $aptr="%rsi"; # const BN_ULONG *aptr, +my $bptr="%rdx"; # not used +my $nptr="%rcx"; # const BN_ULONG *nptr, +my $n0 ="%r8"; # const BN_ULONG *n0); +my $num ="%r9"; # int num, has to be divisible by 4 and + # not less than 8 + +my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); +my @A0=("%r10","%r11"); +my @A1=("%r12","%r13"); +my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); + +$code.=<<___; +.type bn_sqr4x_mont,\@function,6 +.align 16 +bn_sqr4x_mont: +.Lsqr4x_enter: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + shl \$3,${num}d # convert $num to bytes + xor %r10,%r10 + mov %rsp,%r11 # put aside %rsp + sub $num,%r10 # -$num + mov ($n0),$n0 # *n0 + lea -72(%rsp,%r10,2),%rsp # alloca(frame+2*$num) + and \$-1024,%rsp # minimize TLB usage + ############################################################## + # Stack layout + # + # +0 saved $num, used in reduction section + # +8 &t[2*$num], used in reduction section + # +32 saved $rptr + # +40 saved $nptr + # +48 saved *n0 + # +56 saved %rsp + # +64 t[2*$num] + # + mov $rptr,32(%rsp) # save $rptr + mov $nptr,40(%rsp) + mov $n0, 48(%rsp) + mov %r11, 56(%rsp) # save original %rsp +.Lsqr4x_body: + ############################################################## + # Squaring part: + # + # a) multiply-n-add everything but a[i]*a[i]; + # b) shift result of a) by 1 to the left and accumulate + # a[i]*a[i] products; + # + lea 32(%r10),$i # $i=-($num-32) + lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] + + mov $num,$j # $j=$num + + # comments apply to $num==8 case + mov -32($aptr,$i),$a0 # a[0] + lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr,$i),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr,$i),$ai # a[2] + mov %rax,$a1 + + mul $a0 # a[1]*a[0] + mov %rax,$A0[0] # a[1]*a[0] + mov $ai,%rax # a[2] + mov %rdx,$A0[1] + mov $A0[0],-24($tptr,$i) # t[1] + + xor $A0[0],$A0[0] + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + adc %rdx,$A0[0] + mov $A0[1],-16($tptr,$i) # t[2] + + lea -16($i),$j # j=-16 + + + mov 8($aptr,$j),$ai # a[3] + mul $a1 # a[2]*a[1] + mov %rax,$A1[0] # a[2]*a[1]+t[3] + mov $ai,%rax + mov %rdx,$A1[1] + + xor $A0[1],$A0[1] + add $A1[0],$A0[0] + lea 16($j),$j + adc \$0,$A0[1] + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + adc %rdx,$A0[1] + mov $A0[0],-8($tptr,$j) # t[3] + jmp .Lsqr4x_1st + +.align 16 +.Lsqr4x_1st: + mov ($aptr,$j),$ai # a[4] + xor $A1[0],$A1[0] + mul $a1 # a[3]*a[1] + add %rax,$A1[1] # a[3]*a[1]+t[4] + mov $ai,%rax + adc %rdx,$A1[0] + + xor $A0[0],$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + mul $a0 # a[4]*a[0] + add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] + mov $ai,%rax # a[3] + adc %rdx,$A0[0] + mov $A0[1],($tptr,$j) # t[4] + + + mov 8($aptr,$j),$ai # a[5] + xor $A1[1],$A1[1] + mul $a1 # a[4]*a[3] + add %rax,$A1[0] # a[4]*a[3]+t[5] + mov $ai,%rax + adc %rdx,$A1[1] + + xor $A0[1],$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + mul $a0 # a[5]*a[2] + add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] + mov $ai,%rax + adc %rdx,$A0[1] + mov $A0[0],8($tptr,$j) # t[5] + + mov 16($aptr,$j),$ai # a[6] + xor $A1[0],$A1[0] + mul $a1 # a[5]*a[3] + add %rax,$A1[1] # a[5]*a[3]+t[6] + mov $ai,%rax + adc %rdx,$A1[0] + + xor $A0[0],$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + mul $a0 # a[6]*a[2] + add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] + mov $ai,%rax # a[3] + adc %rdx,$A0[0] + mov $A0[1],16($tptr,$j) # t[6] + + + mov 24($aptr,$j),$ai # a[7] + xor $A1[1],$A1[1] + mul $a1 # a[6]*a[5] + add %rax,$A1[0] # a[6]*a[5]+t[7] + mov $ai,%rax + adc %rdx,$A1[1] + + xor $A0[1],$A0[1] + add $A1[0],$A0[0] + lea 32($j),$j + adc \$0,$A0[1] + mul $a0 # a[7]*a[4] + add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] + mov $ai,%rax + adc %rdx,$A0[1] + mov $A0[0],-8($tptr,$j) # t[7] + + cmp \$0,$j + jne .Lsqr4x_1st + + xor $A1[0],$A1[0] + add $A0[1],$A1[1] + adc \$0,$A1[0] + mul $a1 # a[7]*a[5] + add %rax,$A1[1] + adc %rdx,$A1[0] + + mov $A1[1],($tptr) # t[8] + lea 16($i),$i + mov $A1[0],8($tptr) # t[9] + jmp .Lsqr4x_outer + +.align 16 +.Lsqr4x_outer: # comments apply to $num==6 case + mov -32($aptr,$i),$a0 # a[0] + lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr,$i),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr,$i),$ai # a[2] + mov %rax,$a1 + + mov -24($tptr,$i),$A0[0] # t[1] + xor $A0[1],$A0[1] + mul $a0 # a[1]*a[0] + add %rax,$A0[0] # a[1]*a[0]+t[1] + mov $ai,%rax # a[2] + adc %rdx,$A0[1] + mov $A0[0],-24($tptr,$i) # t[1] + + xor $A0[0],$A0[0] + add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] + adc \$0,$A0[0] + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + adc %rdx,$A0[0] + mov $A0[1],-16($tptr,$i) # t[2] + + lea -16($i),$j # j=-16 + xor $A1[0],$A1[0] + + + mov 8($aptr,$j),$ai # a[3] + xor $A1[1],$A1[1] + add 8($tptr,$j),$A1[0] + adc \$0,$A1[1] + mul $a1 # a[2]*a[1] + add %rax,$A1[0] # a[2]*a[1]+t[3] + mov $ai,%rax + adc %rdx,$A1[1] + + xor $A0[1],$A0[1] + add $A1[0],$A0[0] + adc \$0,$A0[1] + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + adc %rdx,$A0[1] + mov $A0[0],8($tptr,$j) # t[3] + + lea 16($j),$j + jmp .Lsqr4x_inner + +.align 16 +.Lsqr4x_inner: + mov ($aptr,$j),$ai # a[4] + xor $A1[0],$A1[0] + add ($tptr,$j),$A1[1] + adc \$0,$A1[0] + mul $a1 # a[3]*a[1] + add %rax,$A1[1] # a[3]*a[1]+t[4] + mov $ai,%rax + adc %rdx,$A1[0] + + xor $A0[0],$A0[0] + add $A1[1],$A0[1] + adc \$0,$A0[0] + mul $a0 # a[4]*a[0] + add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] + mov $ai,%rax # a[3] + adc %rdx,$A0[0] + mov $A0[1],($tptr,$j) # t[4] + + mov 8($aptr,$j),$ai # a[5] + xor $A1[1],$A1[1] + add 8($tptr,$j),$A1[0] + adc \$0,$A1[1] + mul $a1 # a[4]*a[3] + add %rax,$A1[0] # a[4]*a[3]+t[5] + mov $ai,%rax + adc %rdx,$A1[1] + + xor $A0[1],$A0[1] + add $A1[0],$A0[0] + lea 16($j),$j # j++ + adc \$0,$A0[1] + mul $a0 # a[5]*a[2] + add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] + mov $ai,%rax + adc %rdx,$A0[1] + mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below + + cmp \$0,$j + jne .Lsqr4x_inner + + xor $A1[0],$A1[0] + add $A0[1],$A1[1] + adc \$0,$A1[0] + mul $a1 # a[5]*a[3] + add %rax,$A1[1] + adc %rdx,$A1[0] + + mov $A1[1],($tptr) # t[6], "preloaded t[2]" below + mov $A1[0],8($tptr) # t[7], "preloaded t[3]" below + + add \$16,$i + jnz .Lsqr4x_outer + + # comments apply to $num==4 case + mov -32($aptr),$a0 # a[0] + lea 64(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] + mov -24($aptr),%rax # a[1] + lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] + mov -16($aptr),$ai # a[2] + mov %rax,$a1 + + xor $A0[1],$A0[1] + mul $a0 # a[1]*a[0] + add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] + mov $ai,%rax # a[2] + adc %rdx,$A0[1] + mov $A0[0],-24($tptr) # t[1] + + xor $A0[0],$A0[0] + add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] + adc \$0,$A0[0] + mul $a0 # a[2]*a[0] + add %rax,$A0[1] + mov $ai,%rax + adc %rdx,$A0[0] + mov $A0[1],-16($tptr) # t[2] + + mov -8($aptr),$ai # a[3] + mul $a1 # a[2]*a[1] + add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] + mov $ai,%rax + adc \$0,%rdx + + xor $A0[1],$A0[1] + add $A1[0],$A0[0] + mov %rdx,$A1[1] + adc \$0,$A0[1] + mul $a0 # a[3]*a[0] + add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] + mov $ai,%rax + adc %rdx,$A0[1] + mov $A0[0],-8($tptr) # t[3] + + xor $A1[0],$A1[0] + add $A0[1],$A1[1] + adc \$0,$A1[0] + mul $a1 # a[3]*a[1] + add %rax,$A1[1] + mov -16($aptr),%rax # a[2] + adc %rdx,$A1[0] + + mov $A1[1],($tptr) # t[4] + mov $A1[0],8($tptr) # t[5] + + mul $ai # a[2]*a[3] +___ +{ +my ($shift,$carry)=($a0,$a1); +my @S=(@A1,$ai,$n0); +$code.=<<___; + add \$16,$i + xor $shift,$shift + sub $num,$i # $i=16-$num + xor $carry,$carry + + add $A1[0],%rax # t[5] + adc \$0,%rdx + mov %rax,8($tptr) # t[5] + mov %rdx,16($tptr) # t[6] + mov $carry,24($tptr) # t[7] + + mov -16($aptr,$i),%rax # a[0] + lea 64(%rsp,$num,2),$tptr + xor $A0[0],$A0[0] # t[0] + mov -24($tptr,$i,2),$A0[1] # t[1] + + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov -8($aptr,$i),%rax # a[i+1] # prefetch + mov $S[0],-32($tptr,$i,2) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift + mov $S[1],-24($tptr,$i,2) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[2] + mov 0($aptr,$i),%rax # a[i+1] # prefetch + mov $S[2],-16($tptr,$i,2) + adc %rdx,$S[3] + lea 16($i),$i + mov $S[3],-40($tptr,$i,2) + sbb $carry,$carry # mov cf,$carry + jmp .Lsqr4x_shift_n_add + +.align 16 +.Lsqr4x_shift_n_add: + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov -8($aptr,$i),%rax # a[i+1] # prefetch + mov $S[0],-32($tptr,$i,2) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift + mov $S[1],-24($tptr,$i,2) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[2] + mov 0($aptr,$i),%rax # a[i+1] # prefetch + mov $S[2],-16($tptr,$i,2) + adc %rdx,$S[3] + + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + mov $S[3],-8($tptr,$i,2) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov 8($aptr,$i),%rax # a[i+1] # prefetch + mov $S[0],0($tptr,$i,2) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift + mov $S[1],8($tptr,$i,2) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[2] + mov 16($aptr,$i),%rax # a[i+1] # prefetch + mov $S[2],16($tptr,$i,2) + adc %rdx,$S[3] + mov $S[3],24($tptr,$i,2) + sbb $carry,$carry # mov cf,$carry + add \$32,$i + jnz .Lsqr4x_shift_n_add + + lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[1] # | t[2*i]>>63 + mov -16($tptr),$A0[0] # t[2*i+2] # prefetch + mov $A0[1],$shift # shift=t[2*i+1]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch + adc %rax,$S[0] + mov -8($aptr),%rax # a[i+1] # prefetch + mov $S[0],-32($tptr) + adc %rdx,$S[1] + + lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift + mov $S[1],-24($tptr) + sbb $carry,$carry # mov cf,$carry + shr \$63,$A0[0] + lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | + shr \$63,$A0[1] + or $A0[0],$S[3] # | t[2*i]>>63 + mul %rax # a[i]*a[i] + neg $carry # mov $carry,cf + adc %rax,$S[2] + adc %rdx,$S[3] + mov $S[2],-16($tptr) + mov $S[3],-8($tptr) +___ +} +############################################################## +# Montgomery reduction part, "word-by-word" algorithm. +# +{ +my ($topbit,$nptr)=("%rbp",$aptr); +my ($m0,$m1)=($a0,$a1); +my @Ni=("%rbx","%r9"); +$code.=<<___; + mov 40(%rsp),$nptr # restore $nptr + mov 48(%rsp),$n0 # restore *n0 + xor $j,$j + mov $num,0(%rsp) # save $num + sub $num,$j # $j=-$num + mov 64(%rsp),$A0[0] # t[0] # modsched # + mov $n0,$m0 # # modsched # + lea 64(%rsp,$num,2),%rax # end of t[] buffer + lea 64(%rsp,$num),$tptr # end of t[] window + mov %rax,8(%rsp) # save end of t[] buffer + lea ($nptr,$num),$nptr # end of n[] buffer + xor $topbit,$topbit # $topbit=0 + + mov 0($nptr,$j),%rax # n[0] # modsched # + mov 8($nptr,$j),$Ni[1] # n[1] # modsched # + imulq $A0[0],$m0 # m0=t[0]*n0 # modsched # + mov %rax,$Ni[0] # # modsched # + jmp .Lsqr4x_mont_outer + +.align 16 +.Lsqr4x_mont_outer: + xor $A0[1],$A0[1] + mul $m0 # n[0]*m0 + add %rax,$A0[0] # n[0]*m0+t[0] + mov $Ni[1],%rax + adc %rdx,$A0[1] + mov $n0,$m1 + + xor $A0[0],$A0[0] + add 8($tptr,$j),$A0[1] + adc \$0,$A0[0] + mul $m0 # n[1]*m0 + add %rax,$A0[1] # n[1]*m0+t[1] + mov $Ni[0],%rax + adc %rdx,$A0[0] + + imulq $A0[1],$m1 + + mov 16($nptr,$j),$Ni[0] # n[2] + xor $A1[1],$A1[1] + add $A0[1],$A1[0] + adc \$0,$A1[1] + mul $m1 # n[0]*m1 + add %rax,$A1[0] # n[0]*m1+"t[1]" + mov $Ni[0],%rax + adc %rdx,$A1[1] + mov $A1[0],8($tptr,$j) # "t[1]" + + xor $A0[1],$A0[1] + add 16($tptr,$j),$A0[0] + adc \$0,$A0[1] + mul $m0 # n[2]*m0 + add %rax,$A0[0] # n[2]*m0+t[2] + mov $Ni[1],%rax + adc %rdx,$A0[1] + + mov 24($nptr,$j),$Ni[1] # n[3] + xor $A1[0],$A1[0] + add $A0[0],$A1[1] + adc \$0,$A1[0] + mul $m1 # n[1]*m1 + add %rax,$A1[1] # n[1]*m1+"t[2]" + mov $Ni[1],%rax + adc %rdx,$A1[0] + mov $A1[1],16($tptr,$j) # "t[2]" + + xor $A0[0],$A0[0] + add 24($tptr,$j),$A0[1] + lea 32($j),$j + adc \$0,$A0[0] + mul $m0 # n[3]*m0 + add %rax,$A0[1] # n[3]*m0+t[3] + mov $Ni[0],%rax + adc %rdx,$A0[0] + jmp .Lsqr4x_mont_inner + +.align 16 +.Lsqr4x_mont_inner: + mov ($nptr,$j),$Ni[0] # n[4] + xor $A1[1],$A1[1] + add $A0[1],$A1[0] + adc \$0,$A1[1] + mul $m1 # n[2]*m1 + add %rax,$A1[0] # n[2]*m1+"t[3]" + mov $Ni[0],%rax + adc %rdx,$A1[1] + mov $A1[0],-8($tptr,$j) # "t[3]" + + xor $A0[1],$A0[1] + add ($tptr,$j),$A0[0] + adc \$0,$A0[1] + mul $m0 # n[4]*m0 + add %rax,$A0[0] # n[4]*m0+t[4] + mov $Ni[1],%rax + adc %rdx,$A0[1] + + mov 8($nptr,$j),$Ni[1] # n[5] + xor $A1[0],$A1[0] + add $A0[0],$A1[1] + adc \$0,$A1[0] + mul $m1 # n[3]*m1 + add %rax,$A1[1] # n[3]*m1+"t[4]" + mov $Ni[1],%rax + adc %rdx,$A1[0] + mov $A1[1],($tptr,$j) # "t[4]" + + xor $A0[0],$A0[0] + add 8($tptr,$j),$A0[1] + adc \$0,$A0[0] + mul $m0 # n[5]*m0 + add %rax,$A0[1] # n[5]*m0+t[5] + mov $Ni[0],%rax + adc %rdx,$A0[0] + + + mov 16($nptr,$j),$Ni[0] # n[6] + xor $A1[1],$A1[1] + add $A0[1],$A1[0] + adc \$0,$A1[1] + mul $m1 # n[4]*m1 + add %rax,$A1[0] # n[4]*m1+"t[5]" + mov $Ni[0],%rax + adc %rdx,$A1[1] + mov $A1[0],8($tptr,$j) # "t[5]" + + xor $A0[1],$A0[1] + add 16($tptr,$j),$A0[0] + adc \$0,$A0[1] + mul $m0 # n[6]*m0 + add %rax,$A0[0] # n[6]*m0+t[6] + mov $Ni[1],%rax + adc %rdx,$A0[1] + + mov 24($nptr,$j),$Ni[1] # n[7] + xor $A1[0],$A1[0] + add $A0[0],$A1[1] + adc \$0,$A1[0] + mul $m1 # n[5]*m1 + add %rax,$A1[1] # n[5]*m1+"t[6]" + mov $Ni[1],%rax + adc %rdx,$A1[0] + mov $A1[1],16($tptr,$j) # "t[6]" + + xor $A0[0],$A0[0] + add 24($tptr,$j),$A0[1] + lea 32($j),$j + adc \$0,$A0[0] + mul $m0 # n[7]*m0 + add %rax,$A0[1] # n[7]*m0+t[7] + mov $Ni[0],%rax + adc %rdx,$A0[0] + cmp \$0,$j + jne .Lsqr4x_mont_inner + + sub 0(%rsp),$j # $j=-$num # modsched # + mov $n0,$m0 # # modsched # + + xor $A1[1],$A1[1] + add $A0[1],$A1[0] + adc \$0,$A1[1] + mul $m1 # n[6]*m1 + add %rax,$A1[0] # n[6]*m1+"t[7]" + mov $Ni[1],%rax + adc %rdx,$A1[1] + mov $A1[0],-8($tptr) # "t[7]" + + xor $A0[1],$A0[1] + add ($tptr),$A0[0] # +t[8] + adc \$0,$A0[1] + mov 0($nptr,$j),$Ni[0] # n[0] # modsched # + add $topbit,$A0[0] + adc \$0,$A0[1] + + imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched # + xor $A1[0],$A1[0] + mov 8($nptr,$j),$Ni[1] # n[1] # modsched # + add $A0[0],$A1[1] + mov 16($tptr,$j),$A0[0] # t[0] # modsched # + adc \$0,$A1[0] + mul $m1 # n[7]*m1 + add %rax,$A1[1] # n[7]*m1+"t[8]" + mov $Ni[0],%rax # # modsched # + adc %rdx,$A1[0] + mov $A1[1],($tptr) # "t[8]" + + xor $topbit,$topbit + add 8($tptr),$A1[0] # +t[9] + adc $topbit,$topbit + add $A0[1],$A1[0] + lea 16($tptr),$tptr # "t[$num]>>128" + adc \$0,$topbit + mov $A1[0],-8($tptr) # "t[9]" + cmp 8(%rsp),$tptr # are we done? + jb .Lsqr4x_mont_outer + + mov 0(%rsp),$num # restore $num + mov $topbit,($tptr) # save $topbit +___ +} +############################################################## +# Post-condition, 4x unrolled copy from bn_mul_mont +# +{ +my ($tptr,$nptr)=("%rbx",$aptr); +my @ri=("%rax","%rdx","%r10","%r11"); +$code.=<<___; + mov 64(%rsp,$num),@ri[0] # tp[0] + lea 64(%rsp,$num),$tptr # upper half of t[2*$num] holds result + mov 40(%rsp),$nptr # restore $nptr + shr \$5,$num # num/4 + mov 8($tptr),@ri[1] # t[1] + xor $i,$i # i=0 and clear CF! + + mov 32(%rsp),$rptr # restore $rptr + sub 0($nptr),@ri[0] + mov 16($tptr),@ri[2] # t[2] + mov 24($tptr),@ri[3] # t[3] + sbb 8($nptr),@ri[1] + lea -1($num),$j # j=num/4-1 + jmp .Lsqr4x_sub +.align 16 +.Lsqr4x_sub: + mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] + sbb 16($nptr,$i,8),@ri[2] + mov 32($tptr,$i,8),@ri[0] # tp[i+1] + mov 40($tptr,$i,8),@ri[1] + sbb 24($nptr,$i,8),@ri[3] + mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] + sbb 32($nptr,$i,8),@ri[0] + mov 48($tptr,$i,8),@ri[2] + mov 56($tptr,$i,8),@ri[3] + sbb 40($nptr,$i,8),@ri[1] + lea 4($i),$i # i++ + dec $j # doesn't affect CF! + jnz .Lsqr4x_sub + + mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] + mov 32($tptr,$i,8),@ri[0] # load overflow bit + sbb 16($nptr,$i,8),@ri[2] + mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] + sbb 24($nptr,$i,8),@ri[3] + mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] + + sbb \$0,@ri[0] # handle upmost overflow bit + mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] + xor $i,$i # i=0 + and @ri[0],$tptr + not @ri[0] + mov $rptr,$nptr + and @ri[0],$nptr + lea -1($num),$j + or $nptr,$tptr # tp=borrow?tp:rp + + pxor %xmm0,%xmm0 + lea 64(%rsp,$num,8),$nptr + movdqu ($tptr),%xmm1 + lea ($nptr,$num,8),$nptr + movdqa %xmm0,64(%rsp) # zap lower half of temporary vector + movdqa %xmm0,($nptr) # zap upper half of temporary vector + movdqu %xmm1,($rptr) + jmp .Lsqr4x_copy +.align 16 +.Lsqr4x_copy: # copy or in-place refresh + movdqu 16($tptr,$i),%xmm2 + movdqu 32($tptr,$i),%xmm1 + movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector + movdqa %xmm0,96(%rsp,$i) # zap lower half of temporary vector + movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector + movdqa %xmm0,32($nptr,$i) # zap upper half of temporary vector + movdqu %xmm2,16($rptr,$i) + movdqu %xmm1,32($rptr,$i) + lea 32($i),$i + dec $j + jnz .Lsqr4x_copy + + movdqu 16($tptr,$i),%xmm2 + movdqa %xmm0,80(%rsp,$i) # zap lower half of temporary vector + movdqa %xmm0,16($nptr,$i) # zap upper half of temporary vector + movdqu %xmm2,16($rptr,$i) +___ +} +$code.=<<___; + mov 56(%rsp),%rsi # restore %rsp + mov \$1,%rax + mov 0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lsqr4x_epilogue: + ret +.size bn_sqr4x_mont,.-bn_sqr4x_mont +___ +}}} +$code.=<<___; .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by " .align 16 ___ @@ -228,9 +1511,9 @@ $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind -.type se_handler,\@abi-omnipotent +.type mul_handler,\@abi-omnipotent .align 16 -se_handler: +mul_handler: push %rsi push %rdi push %rbx @@ -245,15 +1528,20 @@ se_handler: mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip - lea .Lprologue(%rip),%r10 - cmp %r10,%rbx # context->Rip<.Lprologue - jb .Lin_prologue + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->RipRsp - lea .Lepilogue(%rip),%r10 - cmp %r10,%rbx # context->Rip>=.Lepilogue - jae .Lin_prologue + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer @@ -272,7 +1560,53 @@ se_handler: mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 -.Lin_prologue: + jmp .Lcommon_seh_tail +.size mul_handler,.-mul_handler + +.type sqr_handler,\@abi-omnipotent +.align 16 +sqr_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + lea .Lsqr4x_body(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lsqr_body + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + lea .Lsqr4x_epilogue(%rip),%r10 + cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue + jae .Lcommon_seh_tail + + mov 56(%rax),%rax # pull saved stack pointer + lea 48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp @@ -310,7 +1644,7 @@ se_handler: pop %rdi pop %rsi ret -.size se_handler,.-se_handler +.size sqr_handler,.-sqr_handler .section .pdata .align 4 @@ -318,11 +1652,27 @@ se_handler: .rva .LSEH_end_bn_mul_mont .rva .LSEH_info_bn_mul_mont + .rva .LSEH_begin_bn_mul4x_mont + .rva .LSEH_end_bn_mul4x_mont + .rva .LSEH_info_bn_mul4x_mont + + .rva .LSEH_begin_bn_sqr4x_mont + .rva .LSEH_end_bn_sqr4x_mont + .rva .LSEH_info_bn_sqr4x_mont + .section .xdata .align 8 .LSEH_info_bn_mul_mont: .byte 9,0,0,0 - .rva se_handler + .rva mul_handler + .rva .Lmul_body,.Lmul_epilogue # HandlerData[] +.LSEH_info_bn_mul4x_mont: + .byte 9,0,0,0 + .rva mul_handler + .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] +.LSEH_info_bn_sqr4x_mont: + .byte 9,0,0,0 + .rva sqr_handler ___ } diff --git a/src/lib/libcrypto/bn/asm/x86_64-mont5.pl b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl new file mode 100755 index 0000000000..057cda28aa --- /dev/null +++ b/src/lib/libcrypto/bn/asm/x86_64-mont5.pl @@ -0,0 +1,1070 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# August 2011. +# +# Companion to x86_64-mont.pl that optimizes cache-timing attack +# countermeasures. The subroutines are produced by replacing bp[i] +# references in their x86_64-mont.pl counterparts with cache-neutral +# references to powers table computed in BN_mod_exp_mont_consttime. +# In addition subroutine that scatters elements of the powers table +# is implemented, so that scatter-/gathering can be tuned without +# bn_exp.c modifications. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +# int bn_mul_mont_gather5( +$rp="%rdi"; # BN_ULONG *rp, +$ap="%rsi"; # const BN_ULONG *ap, +$bp="%rdx"; # const BN_ULONG *bp, +$np="%rcx"; # const BN_ULONG *np, +$n0="%r8"; # const BN_ULONG *n0, +$num="%r9"; # int num, + # int idx); # 0 to 2^5-1, "index" in $bp holding + # pre-computed powers of a', interlaced + # in such manner that b[0] is $bp[idx], + # b[1] is [2^5+idx], etc. +$lo0="%r10"; +$hi0="%r11"; +$hi1="%r13"; +$i="%r14"; +$j="%r15"; +$m0="%rbx"; +$m1="%rbp"; + +$code=<<___; +.text + +.globl bn_mul_mont_gather5 +.type bn_mul_mont_gather5,\@function,6 +.align 64 +bn_mul_mont_gather5: + test \$3,${num}d + jnz .Lmul_enter + cmp \$8,${num}d + jb .Lmul_enter + jmp .Lmul4x_enter + +.align 16 +.Lmul_enter: + mov ${num}d,${num}d + mov `($win64?56:8)`(%rsp),%r10d # load 7th argument + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0x28(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) +.Lmul_alloca: +___ +$code.=<<___; + mov %rsp,%rax + lea 2($num),%r11 + neg %r11 + lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) + and \$-1024,%rsp # minimize TLB usage + + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lmul_body: + mov $bp,%r12 # reassign $bp +___ + $bp="%r12"; + $STRIDE=2**5*8; # 5 is "window size" + $N=$STRIDE/4; # should match cache line size +$code.=<<___; + mov %r10,%r11 + shr \$`log($N/8)/log(2)`,%r10 + and \$`$N/8-1`,%r11 + not %r10 + lea .Lmagic_masks(%rip),%rax + and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" + lea 96($bp,%r11,8),$bp # pointer within 1st cache line + movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which + movq 8(%rax,%r10,8),%xmm5 # cache line contains element + movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument + movq 24(%rax,%r10,8),%xmm7 + + movq `0*$STRIDE/4-96`($bp),%xmm0 + movq `1*$STRIDE/4-96`($bp),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bp),%xmm2 + pand %xmm5,%xmm1 + movq `3*$STRIDE/4-96`($bp),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + lea $STRIDE($bp),$bp + por %xmm3,%xmm0 + + movq %xmm0,$m0 # m0=bp[0] + + mov ($n0),$n0 # pull n0[0] value + mov ($ap),%rax + + xor $i,$i # i=0 + xor $j,$j # j=0 + + movq `0*$STRIDE/4-96`($bp),%xmm0 + movq `1*$STRIDE/4-96`($bp),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bp),%xmm2 + pand %xmm5,%xmm1 + + mov $n0,$m1 + mulq $m0 # ap[0]*bp[0] + mov %rax,$lo0 + mov ($np),%rax + + movq `3*$STRIDE/4-96`($bp),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq $lo0,$m1 # "tp[0]"*n0 + mov %rdx,$hi0 + + por %xmm2,%xmm0 + lea $STRIDE($bp),$bp + por %xmm3,%xmm0 + + mulq $m1 # np[0]*m1 + add %rax,$lo0 # discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$hi1 + + lea 1($j),$j # j++ + jmp .L1st_enter + +.align 16 +.L1st: + add %rax,$hi1 + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] + mov $lo0,$hi0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + +.L1st_enter: + mulq $m0 # ap[j]*bp[0] + add %rax,$hi0 + mov ($np,$j,8),%rax + adc \$0,%rdx + lea 1($j),$j # j++ + mov %rdx,$lo0 + + mulq $m1 # np[j]*m1 + cmp $num,$j + jne .L1st + + movq %xmm0,$m0 # bp[1] + + add %rax,$hi1 + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + mov $lo0,$hi0 + + xor %rdx,%rdx + add $hi0,$hi1 + adc \$0,%rdx + mov $hi1,-8(%rsp,$num,8) + mov %rdx,(%rsp,$num,8) # store upmost overflow bit + + lea 1($i),$i # i++ + jmp .Louter +.align 16 +.Louter: + xor $j,$j # j=0 + mov $n0,$m1 + mov (%rsp),$lo0 + + movq `0*$STRIDE/4-96`($bp),%xmm0 + movq `1*$STRIDE/4-96`($bp),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bp),%xmm2 + pand %xmm5,%xmm1 + + mulq $m0 # ap[0]*bp[i] + add %rax,$lo0 # ap[0]*bp[i]+tp[0] + mov ($np),%rax + adc \$0,%rdx + + movq `3*$STRIDE/4-96`($bp),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq $lo0,$m1 # tp[0]*n0 + mov %rdx,$hi0 + + por %xmm2,%xmm0 + lea $STRIDE($bp),$bp + por %xmm3,%xmm0 + + mulq $m1 # np[0]*m1 + add %rax,$lo0 # discarded + mov 8($ap),%rax + adc \$0,%rdx + mov 8(%rsp),$lo0 # tp[1] + mov %rdx,$hi1 + + lea 1($j),$j # j++ + jmp .Linner_enter + +.align 16 +.Linner: + add %rax,$hi1 + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] + mov (%rsp,$j,8),$lo0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + +.Linner_enter: + mulq $m0 # ap[j]*bp[i] + add %rax,$hi0 + mov ($np,$j,8),%rax + adc \$0,%rdx + add $hi0,$lo0 # ap[j]*bp[i]+tp[j] + mov %rdx,$hi0 + adc \$0,$hi0 + lea 1($j),$j # j++ + + mulq $m1 # np[j]*m1 + cmp $num,$j + jne .Linner + + movq %xmm0,$m0 # bp[i+1] + + add %rax,$hi1 + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] + mov (%rsp,$j,8),$lo0 + adc \$0,%rdx + mov $hi1,-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$hi1 + + xor %rdx,%rdx + add $hi0,$hi1 + adc \$0,%rdx + add $lo0,$hi1 # pull upmost overflow bit + adc \$0,%rdx + mov $hi1,-8(%rsp,$num,8) + mov %rdx,(%rsp,$num,8) # store upmost overflow bit + + lea 1($i),$i # i++ + cmp $num,$i + jl .Louter + + xor $i,$i # i=0 and clear CF! + mov (%rsp),%rax # tp[0] + lea (%rsp),$ap # borrow ap for tp + mov $num,$j # j=num + jmp .Lsub +.align 16 +.Lsub: sbb ($np,$i,8),%rax + mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] + mov 8($ap,$i,8),%rax # tp[i+1] + lea 1($i),$i # i++ + dec $j # doesnn't affect CF! + jnz .Lsub + + sbb \$0,%rax # handle upmost overflow bit + xor $i,$i + and %rax,$ap + not %rax + mov $rp,$np + and %rax,$np + mov $num,$j # j=num + or $np,$ap # ap=borrow?tp:rp +.align 16 +.Lcopy: # copy or in-place refresh + mov ($ap,$i,8),%rax + mov $i,(%rsp,$i,8) # zap temporary vector + mov %rax,($rp,$i,8) # rp[i]=tp[i] + lea 1($i),$i + sub \$1,$j + jnz .Lcopy + + mov 8(%rsp,$num,8),%rsi # restore %rsp + mov \$1,%rax +___ +$code.=<<___ if ($win64); + movaps (%rsi),%xmm6 + movaps 0x10(%rsi),%xmm7 + lea 0x28(%rsi),%rsi +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lmul_epilogue: + ret +.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 +___ +{{{ +my @A=("%r10","%r11"); +my @N=("%r13","%rdi"); +$code.=<<___; +.type bn_mul4x_mont_gather5,\@function,6 +.align 16 +bn_mul4x_mont_gather5: +.Lmul4x_enter: + mov ${num}d,${num}d + mov `($win64?56:8)`(%rsp),%r10d # load 7th argument + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0x28(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) +.Lmul4x_alloca: +___ +$code.=<<___; + mov %rsp,%rax + lea 4($num),%r11 + neg %r11 + lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+4)) + and \$-1024,%rsp # minimize TLB usage + + mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp +.Lmul4x_body: + mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp + mov %rdx,%r12 # reassign $bp +___ + $bp="%r12"; + $STRIDE=2**5*8; # 5 is "window size" + $N=$STRIDE/4; # should match cache line size +$code.=<<___; + mov %r10,%r11 + shr \$`log($N/8)/log(2)`,%r10 + and \$`$N/8-1`,%r11 + not %r10 + lea .Lmagic_masks(%rip),%rax + and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" + lea 96($bp,%r11,8),$bp # pointer within 1st cache line + movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which + movq 8(%rax,%r10,8),%xmm5 # cache line contains element + movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument + movq 24(%rax,%r10,8),%xmm7 + + movq `0*$STRIDE/4-96`($bp),%xmm0 + movq `1*$STRIDE/4-96`($bp),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bp),%xmm2 + pand %xmm5,%xmm1 + movq `3*$STRIDE/4-96`($bp),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + lea $STRIDE($bp),$bp + por %xmm3,%xmm0 + + movq %xmm0,$m0 # m0=bp[0] + mov ($n0),$n0 # pull n0[0] value + mov ($ap),%rax + + xor $i,$i # i=0 + xor $j,$j # j=0 + + movq `0*$STRIDE/4-96`($bp),%xmm0 + movq `1*$STRIDE/4-96`($bp),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bp),%xmm2 + pand %xmm5,%xmm1 + + mov $n0,$m1 + mulq $m0 # ap[0]*bp[0] + mov %rax,$A[0] + mov ($np),%rax + + movq `3*$STRIDE/4-96`($bp),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq $A[0],$m1 # "tp[0]"*n0 + mov %rdx,$A[1] + + por %xmm2,%xmm0 + lea $STRIDE($bp),$bp + por %xmm3,%xmm0 + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 + add %rax,$A[1] + mov 8($np),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 + add %rax,$N[1] + mov 16($ap),%rax + adc \$0,%rdx + add $A[1],$N[1] + lea 4($j),$j # j++ + adc \$0,%rdx + mov $N[1],(%rsp) + mov %rdx,$N[0] + jmp .L1st4x +.align 16 +.L1st4x: + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov ($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-8(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov 8($np,$j,8),%rax + adc \$0,%rdx + lea 4($j),$j # j++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov -16($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + cmp $num,$j + jl .L1st4x + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[0] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + movq %xmm0,$m0 # bp[1] + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + mov $N[0],-8(%rsp,$j,8) + mov $N[1],(%rsp,$j,8) # store upmost overflow bit + + lea 1($i),$i # i++ +.align 4 +.Louter4x: + xor $j,$j # j=0 + movq `0*$STRIDE/4-96`($bp),%xmm0 + movq `1*$STRIDE/4-96`($bp),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bp),%xmm2 + pand %xmm5,%xmm1 + + mov (%rsp),$A[0] + mov $n0,$m1 + mulq $m0 # ap[0]*bp[i] + add %rax,$A[0] # ap[0]*bp[i]+tp[0] + mov ($np),%rax + adc \$0,%rdx + + movq `3*$STRIDE/4-96`($bp),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + imulq $A[0],$m1 # tp[0]*n0 + mov %rdx,$A[1] + + por %xmm2,%xmm0 + lea $STRIDE($bp),$bp + por %xmm3,%xmm0 + + mulq $m1 # np[0]*m1 + add %rax,$A[0] # "$N[0]", discarded + mov 8($ap),%rax + adc \$0,%rdx + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8($np),%rax + adc \$0,%rdx + add 8(%rsp),$A[1] # +tp[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov 16($ap),%rax + adc \$0,%rdx + add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] + lea 4($j),$j # j+=2 + adc \$0,%rdx + mov %rdx,$N[0] + jmp .Linner4x +.align 16 +.Linner4x: + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + add -8(%rsp,$j,8),$A[1] + adc \$0,%rdx + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov ($np,$j,8),%rax + adc \$0,%rdx + add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov 8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov 8($np,$j,8),%rax + adc \$0,%rdx + add 8(%rsp,$j,8),$A[1] + adc \$0,%rdx + lea 4($j),$j # j++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov -16($ap,$j,8),%rax + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[0],-40(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + cmp $num,$j + jl .Linner4x + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[0] + mov -16($np,$j,8),%rax + adc \$0,%rdx + add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] + adc \$0,%rdx + mov %rdx,$A[1] + + mulq $m1 # np[j]*m1 + add %rax,$N[0] + mov -8($ap,$j,8),%rax + adc \$0,%rdx + add $A[0],$N[0] + adc \$0,%rdx + mov $N[1],-32(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[1] + + mulq $m0 # ap[j]*bp[i] + add %rax,$A[1] + mov -8($np,$j,8),%rax + adc \$0,%rdx + add -8(%rsp,$j,8),$A[1] + adc \$0,%rdx + lea 1($i),$i # i++ + mov %rdx,$A[0] + + mulq $m1 # np[j]*m1 + add %rax,$N[1] + mov ($ap),%rax # ap[0] + adc \$0,%rdx + add $A[1],$N[1] + adc \$0,%rdx + mov $N[0],-24(%rsp,$j,8) # tp[j-1] + mov %rdx,$N[0] + + movq %xmm0,$m0 # bp[i+1] + mov $N[1],-16(%rsp,$j,8) # tp[j-1] + + xor $N[1],$N[1] + add $A[0],$N[0] + adc \$0,$N[1] + add (%rsp,$num,8),$N[0] # pull upmost overflow bit + adc \$0,$N[1] + mov $N[0],-8(%rsp,$j,8) + mov $N[1],(%rsp,$j,8) # store upmost overflow bit + + cmp $num,$i + jl .Louter4x +___ +{ +my @ri=("%rax","%rdx",$m0,$m1); +$code.=<<___; + mov 16(%rsp,$num,8),$rp # restore $rp + mov 0(%rsp),@ri[0] # tp[0] + pxor %xmm0,%xmm0 + mov 8(%rsp),@ri[1] # tp[1] + shr \$2,$num # num/=4 + lea (%rsp),$ap # borrow ap for tp + xor $i,$i # i=0 and clear CF! + + sub 0($np),@ri[0] + mov 16($ap),@ri[2] # tp[2] + mov 24($ap),@ri[3] # tp[3] + sbb 8($np),@ri[1] + lea -1($num),$j # j=num/4-1 + jmp .Lsub4x +.align 16 +.Lsub4x: + mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 16($np,$i,8),@ri[2] + mov 32($ap,$i,8),@ri[0] # tp[i+1] + mov 40($ap,$i,8),@ri[1] + sbb 24($np,$i,8),@ri[3] + mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 32($np,$i,8),@ri[0] + mov 48($ap,$i,8),@ri[2] + mov 56($ap,$i,8),@ri[3] + sbb 40($np,$i,8),@ri[1] + lea 4($i),$i # i++ + dec $j # doesnn't affect CF! + jnz .Lsub4x + + mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] + mov 32($ap,$i,8),@ri[0] # load overflow bit + sbb 16($np,$i,8),@ri[2] + mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] + sbb 24($np,$i,8),@ri[3] + mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] + + sbb \$0,@ri[0] # handle upmost overflow bit + mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] + xor $i,$i # i=0 + and @ri[0],$ap + not @ri[0] + mov $rp,$np + and @ri[0],$np + lea -1($num),$j + or $np,$ap # ap=borrow?tp:rp + + movdqu ($ap),%xmm1 + movdqa %xmm0,(%rsp) + movdqu %xmm1,($rp) + jmp .Lcopy4x +.align 16 +.Lcopy4x: # copy or in-place refresh + movdqu 16($ap,$i),%xmm2 + movdqu 32($ap,$i),%xmm1 + movdqa %xmm0,16(%rsp,$i) + movdqu %xmm2,16($rp,$i) + movdqa %xmm0,32(%rsp,$i) + movdqu %xmm1,32($rp,$i) + lea 32($i),$i + dec $j + jnz .Lcopy4x + + shl \$2,$num + movdqu 16($ap,$i),%xmm2 + movdqa %xmm0,16(%rsp,$i) + movdqu %xmm2,16($rp,$i) +___ +} +$code.=<<___; + mov 8(%rsp,$num,8),%rsi # restore %rsp + mov \$1,%rax +___ +$code.=<<___ if ($win64); + movaps (%rsi),%xmm6 + movaps 0x10(%rsi),%xmm7 + lea 0x28(%rsi),%rsi +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lmul4x_epilogue: + ret +.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 +___ +}}} + +{ +my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order +my $out=$inp; +my $STRIDE=2**5*8; +my $N=$STRIDE/4; + +$code.=<<___; +.globl bn_scatter5 +.type bn_scatter5,\@abi-omnipotent +.align 16 +bn_scatter5: + cmp \$0, $num + jz .Lscatter_epilogue + lea ($tbl,$idx,8),$tbl +.Lscatter: + mov ($inp),%rax + lea 8($inp),$inp + mov %rax,($tbl) + lea 32*8($tbl),$tbl + sub \$1,$num + jnz .Lscatter +.Lscatter_epilogue: + ret +.size bn_scatter5,.-bn_scatter5 + +.globl bn_gather5 +.type bn_gather5,\@abi-omnipotent +.align 16 +bn_gather5: +___ +$code.=<<___ if ($win64); +.LSEH_begin_bn_gather5: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp + .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) + .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) +___ +$code.=<<___; + mov $idx,%r11 + shr \$`log($N/8)/log(2)`,$idx + and \$`$N/8-1`,%r11 + not $idx + lea .Lmagic_masks(%rip),%rax + and \$`2**5/($N/8)-1`,$idx # 5 is "window size" + lea 96($tbl,%r11,8),$tbl # pointer within 1st cache line + movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which + movq 8(%rax,$idx,8),%xmm5 # cache line contains element + movq 16(%rax,$idx,8),%xmm6 # denoted by 7th argument + movq 24(%rax,$idx,8),%xmm7 + jmp .Lgather +.align 16 +.Lgather: + movq `0*$STRIDE/4-96`($tbl),%xmm0 + movq `1*$STRIDE/4-96`($tbl),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($tbl),%xmm2 + pand %xmm5,%xmm1 + movq `3*$STRIDE/4-96`($tbl),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + lea $STRIDE($tbl),$tbl + por %xmm3,%xmm0 + + movq %xmm0,($out) # m0=bp[0] + lea 8($out),$out + sub \$1,$num + jnz .Lgather +___ +$code.=<<___ if ($win64); + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) + lea 0x28(%rsp),%rsp +___ +$code.=<<___; + ret +.LSEH_end_bn_gather5: +.size bn_gather5,.-bn_gather5 +___ +} +$code.=<<___; +.align 64 +.Lmagic_masks: + .long 0,0, 0,0, 0,0, -1,-1 + .long 0,0, 0,0, 0,0, 0,0 +.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by " +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type mul_handler,\@abi-omnipotent +.align 16 +mul_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label + cmp %r10,%rbx # context->RipRipRsp + + mov 8(%r11),%r10d # HandlerData[2] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 192($context),%r10 # pull $num + mov 8(%rax,%r10,8),%rax # pull saved stack pointer + + movaps (%rax),%xmm0 + movaps 16(%rax),%xmm1 + lea `40+48`(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + movups %xmm0,512($context) # restore context->Xmm6 + movups %xmm1,528($context) # restore context->Xmm7 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size mul_handler,.-mul_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_bn_mul_mont_gather5 + .rva .LSEH_end_bn_mul_mont_gather5 + .rva .LSEH_info_bn_mul_mont_gather5 + + .rva .LSEH_begin_bn_mul4x_mont_gather5 + .rva .LSEH_end_bn_mul4x_mont_gather5 + .rva .LSEH_info_bn_mul4x_mont_gather5 + + .rva .LSEH_begin_bn_gather5 + .rva .LSEH_end_bn_gather5 + .rva .LSEH_info_bn_gather5 + +.section .xdata +.align 8 +.LSEH_info_bn_mul_mont_gather5: + .byte 9,0,0,0 + .rva mul_handler + .rva .Lmul_alloca,.Lmul_body,.Lmul_epilogue # HandlerData[] +.align 8 +.LSEH_info_bn_mul4x_mont_gather5: + .byte 9,0,0,0 + .rva mul_handler + .rva .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] +.align 8 +.LSEH_info_bn_gather5: + .byte 0x01,0x0d,0x05,0x00 + .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 + .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 + .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28 +.align 8 +___ +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/bn/bn.h b/src/lib/libcrypto/bn/bn.h index a0bc47837d..f34248ec4f 100644 --- a/src/lib/libcrypto/bn/bn.h +++ b/src/lib/libcrypto/bn/bn.h @@ -558,6 +558,17 @@ int BN_is_prime_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx, BN_GENCB *cb); int BN_is_prime_fasttest_ex(const BIGNUM *p,int nchecks, BN_CTX *ctx, int do_trial_division, BN_GENCB *cb); +int BN_X931_generate_Xpq(BIGNUM *Xp, BIGNUM *Xq, int nbits, BN_CTX *ctx); + +int BN_X931_derive_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2, + const BIGNUM *Xp, const BIGNUM *Xp1, const BIGNUM *Xp2, + const BIGNUM *e, BN_CTX *ctx, BN_GENCB *cb); +int BN_X931_generate_prime_ex(BIGNUM *p, BIGNUM *p1, BIGNUM *p2, + BIGNUM *Xp1, BIGNUM *Xp2, + const BIGNUM *Xp, + const BIGNUM *e, BN_CTX *ctx, + BN_GENCB *cb); + BN_MONT_CTX *BN_MONT_CTX_new(void ); void BN_MONT_CTX_init(BN_MONT_CTX *ctx); int BN_mod_mul_montgomery(BIGNUM *r,const BIGNUM *a,const BIGNUM *b, @@ -612,6 +623,8 @@ int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, BN_RECP_CTX *recp, BN_CTX *ctx); +#ifndef OPENSSL_NO_EC2M + /* Functions for arithmetic over binary polynomials represented by BIGNUMs. * * The BIGNUM::neg property of BIGNUMs representing binary polynomials is @@ -663,6 +676,8 @@ int BN_GF2m_mod_solve_quad_arr(BIGNUM *r, const BIGNUM *a, int BN_GF2m_poly2arr(const BIGNUM *a, int p[], int max); int BN_GF2m_arr2poly(const int p[], BIGNUM *a); +#endif + /* faster mod functions for the 'NIST primes' * 0 <= a < p^2 */ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx); diff --git a/src/lib/libcrypto/bn/bn_div.c b/src/lib/libcrypto/bn/bn_div.c index 802a43d642..52b3304293 100644 --- a/src/lib/libcrypto/bn/bn_div.c +++ b/src/lib/libcrypto/bn/bn_div.c @@ -169,15 +169,13 @@ int BN_div(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, const BIGNUM *d, #endif /* OPENSSL_NO_ASM */ -/* BN_div[_no_branch] computes dv := num / divisor, rounding towards +/* BN_div computes dv := num / divisor, rounding towards * zero, and sets up rm such that dv*divisor + rm = num holds. * Thus: * dv->neg == num->neg ^ divisor->neg (unless the result is zero) * rm->neg == num->neg (unless the remainder is zero) * If 'dv' or 'rm' is NULL, the respective value is not returned. */ -static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, - const BIGNUM *divisor, BN_CTX *ctx); int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, BN_CTX *ctx) { @@ -186,6 +184,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, BN_ULONG *resp,*wnump; BN_ULONG d0,d1; int num_n,div_n; + int no_branch=0; /* Invalid zero-padding would have particularly bad consequences * in the case of 'num', so don't just rely on bn_check_top() for this one @@ -200,7 +199,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, if ((BN_get_flags(num, BN_FLG_CONSTTIME) != 0) || (BN_get_flags(divisor, BN_FLG_CONSTTIME) != 0)) { - return BN_div_no_branch(dv, rm, num, divisor, ctx); + no_branch=1; } bn_check_top(dv); @@ -214,7 +213,7 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, return(0); } - if (BN_ucmp(num,divisor) < 0) + if (!no_branch && BN_ucmp(num,divisor) < 0) { if (rm != NULL) { if (BN_copy(rm,num) == NULL) return(0); } @@ -239,242 +238,25 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor, norm_shift+=BN_BITS2; if (!(BN_lshift(snum,num,norm_shift))) goto err; snum->neg=0; - div_n=sdiv->top; - num_n=snum->top; - loop=num_n-div_n; - /* Lets setup a 'window' into snum - * This is the part that corresponds to the current - * 'area' being divided */ - wnum.neg = 0; - wnum.d = &(snum->d[loop]); - wnum.top = div_n; - /* only needed when BN_ucmp messes up the values between top and max */ - wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */ - - /* Get the top 2 words of sdiv */ - /* div_n=sdiv->top; */ - d0=sdiv->d[div_n-1]; - d1=(div_n == 1)?0:sdiv->d[div_n-2]; - - /* pointer to the 'top' of snum */ - wnump= &(snum->d[num_n-1]); - - /* Setup to 'res' */ - res->neg= (num->neg^divisor->neg); - if (!bn_wexpand(res,(loop+1))) goto err; - res->top=loop; - resp= &(res->d[loop-1]); - - /* space for temp */ - if (!bn_wexpand(tmp,(div_n+1))) goto err; - if (BN_ucmp(&wnum,sdiv) >= 0) + if (no_branch) { - /* If BN_DEBUG_RAND is defined BN_ucmp changes (via - * bn_pollute) the const bignum arguments => - * clean the values between top and max again */ - bn_clear_top2max(&wnum); - bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n); - *resp=1; - } - else - res->top--; - /* if res->top == 0 then clear the neg value otherwise decrease - * the resp pointer */ - if (res->top == 0) - res->neg = 0; - else - resp--; - - for (i=0; i 0x%08X\n", - n0, n1, d0, q); -#endif -#endif - -#ifndef REMAINDER_IS_ALREADY_CALCULATED - /* - * rem doesn't have to be BN_ULLONG. The least we - * know it's less that d0, isn't it? - */ - rem=(n1-q*d0)&BN_MASK2; -#endif - t2=(BN_ULLONG)d1*q; - - for (;;) - { - if (t2 <= ((((BN_ULLONG)rem)< 0x%08X\n", - n0, n1, d0, q); -#endif -#ifndef REMAINDER_IS_ALREADY_CALCULATED - rem=(n1-q*d0)&BN_MASK2; -#endif - -#if defined(BN_UMULT_LOHI) - BN_UMULT_LOHI(t2l,t2h,d1,q); -#elif defined(BN_UMULT_HIGH) - t2l = d1 * q; - t2h = BN_UMULT_HIGH(d1,q); -#else + /* Since we don't know whether snum is larger than sdiv, + * we pad snum with enough zeroes without changing its + * value. + */ + if (snum->top <= sdiv->top+1) { - BN_ULONG ql, qh; - t2l=LBITS(d1); t2h=HBITS(d1); - ql =LBITS(q); qh =HBITS(q); - mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */ + if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err; + for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0; + snum->top = sdiv->top + 2; } -#endif - - for (;;) - { - if ((t2h < rem) || - ((t2h == rem) && (t2l <= wnump[-2]))) - break; - q--; - rem += d0; - if (rem < d0) break; /* don't let rem overflow */ - if (t2l < d1) t2h--; t2l -= d1; - } -#endif /* !BN_LLONG */ - } -#endif /* !BN_DIV3W */ - - l0=bn_mul_words(tmp->d,sdiv->d,div_n,q); - tmp->d[div_n]=l0; - wnum.d--; - /* ingore top values of the bignums just sub the two - * BN_ULONG arrays with bn_sub_words */ - if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n+1)) + else { - /* Note: As we have considered only the leading - * two BN_ULONGs in the calculation of q, sdiv * q - * might be greater than wnum (but then (q-1) * sdiv - * is less or equal than wnum) - */ - q--; - if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n)) - /* we can't have an overflow here (assuming - * that q != 0, but if q == 0 then tmp is - * zero anyway) */ - (*wnump)++; + if (bn_wexpand(snum, snum->top + 1) == NULL) goto err; + snum->d[snum->top] = 0; + snum->top ++; } - /* store part of the result */ - *resp = q; - } - bn_correct_top(snum); - if (rm != NULL) - { - /* Keep a copy of the neg flag in num because if rm==num - * BN_rshift() will overwrite it. - */ - int neg = num->neg; - BN_rshift(rm,snum,norm_shift); - if (!BN_is_zero(rm)) - rm->neg = neg; - bn_check_top(rm); - } - BN_CTX_end(ctx); - return(1); -err: - bn_check_top(rm); - BN_CTX_end(ctx); - return(0); - } - - -/* BN_div_no_branch is a special version of BN_div. It does not contain - * branches that may leak sensitive information. - */ -static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, - const BIGNUM *divisor, BN_CTX *ctx) - { - int norm_shift,i,loop; - BIGNUM *tmp,wnum,*snum,*sdiv,*res; - BN_ULONG *resp,*wnump; - BN_ULONG d0,d1; - int num_n,div_n; - - bn_check_top(dv); - bn_check_top(rm); - /* bn_check_top(num); */ /* 'num' has been checked in BN_div() */ - bn_check_top(divisor); - - if (BN_is_zero(divisor)) - { - BNerr(BN_F_BN_DIV_NO_BRANCH,BN_R_DIV_BY_ZERO); - return(0); - } - - BN_CTX_start(ctx); - tmp=BN_CTX_get(ctx); - snum=BN_CTX_get(ctx); - sdiv=BN_CTX_get(ctx); - if (dv == NULL) - res=BN_CTX_get(ctx); - else res=dv; - if (sdiv == NULL || res == NULL) goto err; - - /* First we normalise the numbers */ - norm_shift=BN_BITS2-((BN_num_bits(divisor))%BN_BITS2); - if (!(BN_lshift(sdiv,divisor,norm_shift))) goto err; - sdiv->neg=0; - norm_shift+=BN_BITS2; - if (!(BN_lshift(snum,num,norm_shift))) goto err; - snum->neg=0; - - /* Since we don't know whether snum is larger than sdiv, - * we pad snum with enough zeroes without changing its - * value. - */ - if (snum->top <= sdiv->top+1) - { - if (bn_wexpand(snum, sdiv->top + 2) == NULL) goto err; - for (i = snum->top; i < sdiv->top + 2; i++) snum->d[i] = 0; - snum->top = sdiv->top + 2; - } - else - { - if (bn_wexpand(snum, snum->top + 1) == NULL) goto err; - snum->d[snum->top] = 0; - snum->top ++; } div_n=sdiv->top; @@ -500,12 +282,27 @@ static int BN_div_no_branch(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, /* Setup to 'res' */ res->neg= (num->neg^divisor->neg); if (!bn_wexpand(res,(loop+1))) goto err; - res->top=loop-1; + res->top=loop-no_branch; resp= &(res->d[loop-1]); /* space for temp */ if (!bn_wexpand(tmp,(div_n+1))) goto err; + if (!no_branch) + { + if (BN_ucmp(&wnum,sdiv) >= 0) + { + /* If BN_DEBUG_RAND is defined BN_ucmp changes (via + * bn_pollute) the const bignum arguments => + * clean the values between top and max again */ + bn_clear_top2max(&wnum); + bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n); + *resp=1; + } + else + res->top--; + } + /* if res->top == 0 then clear the neg value otherwise decrease * the resp pointer */ if (res->top == 0) @@ -638,7 +435,7 @@ X) -> 0x%08X\n", rm->neg = neg; bn_check_top(rm); } - bn_correct_top(res); + if (no_branch) bn_correct_top(res); BN_CTX_end(ctx); return(1); err: @@ -646,5 +443,4 @@ err: BN_CTX_end(ctx); return(0); } - #endif diff --git a/src/lib/libcrypto/bn/bn_exp.c b/src/lib/libcrypto/bn/bn_exp.c index d9b6c737fc..2abf6fd678 100644 --- a/src/lib/libcrypto/bn/bn_exp.c +++ b/src/lib/libcrypto/bn/bn_exp.c @@ -113,6 +113,18 @@ #include "cryptlib.h" #include "bn_lcl.h" +#include +#ifdef _WIN32 +# include +# ifndef alloca +# define alloca _alloca +# endif +#elif defined(__GNUC__) +# ifndef alloca +# define alloca(s) __builtin_alloca((s)) +# endif +#endif + /* maximum precomputation table size for *variable* sliding windows */ #define TABLE_SIZE 32 @@ -522,23 +534,17 @@ err: * as cache lines are concerned. The following functions are used to transfer a BIGNUM * from/to that table. */ -static int MOD_EXP_CTIME_COPY_TO_PREBUF(BIGNUM *b, int top, unsigned char *buf, int idx, int width) +static int MOD_EXP_CTIME_COPY_TO_PREBUF(const BIGNUM *b, int top, unsigned char *buf, int idx, int width) { size_t i, j; - if (bn_wexpand(b, top) == NULL) - return 0; - while (b->top < top) - { - b->d[b->top++] = 0; - } - + if (top > b->top) + top = b->top; /* this works because 'buf' is explicitly zeroed */ for (i = 0, j=idx; i < top * sizeof b->d[0]; i++, j+=width) { buf[j] = ((unsigned char*)b->d)[i]; } - bn_correct_top(b); return 1; } @@ -561,7 +567,7 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf /* Given a pointer value, compute the next address that is a cache line multiple. */ #define MOD_EXP_CTIME_ALIGN(x_) \ - ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK)))) + ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK)))) /* This variant of BN_mod_exp_mont() uses fixed windows and the special * precomputation memory layout to limit data-dependency to a minimum @@ -572,17 +578,15 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont) { - int i,bits,ret=0,idx,window,wvalue; + int i,bits,ret=0,window,wvalue; int top; - BIGNUM *r; - const BIGNUM *aa; BN_MONT_CTX *mont=NULL; int numPowers; unsigned char *powerbufFree=NULL; int powerbufLen = 0; unsigned char *powerbuf=NULL; - BIGNUM *computeTemp=NULL, *am=NULL; + BIGNUM tmp, am; bn_check_top(a); bn_check_top(p); @@ -602,10 +606,7 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, return ret; } - /* Initialize BIGNUM context and allocate intermediate result */ BN_CTX_start(ctx); - r = BN_CTX_get(ctx); - if (r == NULL) goto err; /* Allocate a montgomery context if it was not supplied by the caller. * If this is not done, things will break in the montgomery part. @@ -620,40 +621,154 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, /* Get the window size to use with size of p. */ window = BN_window_bits_for_ctime_exponent_size(bits); +#if defined(OPENSSL_BN_ASM_MONT5) + if (window==6 && bits<=1024) window=5; /* ~5% improvement of 2048-bit RSA sign */ +#endif /* Allocate a buffer large enough to hold all of the pre-computed - * powers of a. + * powers of am, am itself and tmp. */ numPowers = 1 << window; - powerbufLen = sizeof(m->d[0])*top*numPowers; + powerbufLen = sizeof(m->d[0])*(top*numPowers + + ((2*top)>numPowers?(2*top):numPowers)); +#ifdef alloca + if (powerbufLen < 3072) + powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH); + else +#endif if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL) goto err; powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree); memset(powerbuf, 0, powerbufLen); - /* Initialize the intermediate result. Do this early to save double conversion, - * once each for a^0 and intermediate result. - */ - if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err; - if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err; +#ifdef alloca + if (powerbufLen < 3072) + powerbufFree = NULL; +#endif - /* Initialize computeTemp as a^1 with montgomery precalcs */ - computeTemp = BN_CTX_get(ctx); - am = BN_CTX_get(ctx); - if (computeTemp==NULL || am==NULL) goto err; + /* lay down tmp and am right after powers table */ + tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers); + am.d = tmp.d + top; + tmp.top = am.top = 0; + tmp.dmax = am.dmax = top; + tmp.neg = am.neg = 0; + tmp.flags = am.flags = BN_FLG_STATIC_DATA; + + /* prepare a^0 in Montgomery domain */ +#if 1 + if (!BN_to_montgomery(&tmp,BN_value_one(),mont,ctx)) goto err; +#else + tmp.d[0] = (0-m->d[0])&BN_MASK2; /* 2^(top*BN_BITS2) - m */ + for (i=1;id[i])&BN_MASK2; + tmp.top = top; +#endif + /* prepare a^1 in Montgomery domain */ if (a->neg || BN_ucmp(a,m) >= 0) { - if (!BN_mod(am,a,m,ctx)) - goto err; - aa= am; + if (!BN_mod(&am,a,m,ctx)) goto err; + if (!BN_to_montgomery(&am,&am,mont,ctx)) goto err; } - else - aa=a; - if (!BN_to_montgomery(am,aa,mont,ctx)) goto err; - if (!BN_copy(computeTemp, am)) goto err; - if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err; + else if (!BN_to_montgomery(&am,a,mont,ctx)) goto err; + +#if defined(OPENSSL_BN_ASM_MONT5) + /* This optimization uses ideas from http://eprint.iacr.org/2011/239, + * specifically optimization of cache-timing attack countermeasures + * and pre-computation optimization. */ + + /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as + * 512-bit RSA is hardly relevant, we omit it to spare size... */ + if (window==5) + { + void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap, + const void *table,const BN_ULONG *np, + const BN_ULONG *n0,int num,int power); + void bn_scatter5(const BN_ULONG *inp,size_t num, + void *table,size_t power); + void bn_gather5(BN_ULONG *out,size_t num, + void *table,size_t power); + + BN_ULONG *np=mont->N.d, *n0=mont->n0; + + /* BN_to_montgomery can contaminate words above .top + * [in BN_DEBUG[_DEBUG] build]... */ + for (i=am.top; i=0; i--,bits--) + wvalue = (wvalue<<1)+BN_is_bit_set(p,bits); + bn_gather5(tmp.d,top,powerbuf,wvalue); + + /* Scan the exponent one window at a time starting from the most + * significant bits. + */ + while (bits >= 0) + { + for (wvalue=0, i=0; i<5; i++,bits--) + wvalue = (wvalue<<1)+BN_is_bit_set(p,bits); + + bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); + bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); + bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); + bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); + bn_mul_mont(tmp.d,tmp.d,tmp.d,np,n0,top); + bn_mul_mont_gather5(tmp.d,tmp.d,powerbuf,np,n0,top,wvalue); + } + + tmp.top=top; + bn_correct_top(&tmp); + } + else +#endif + { + if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&tmp, top, powerbuf, 0, numPowers)) goto err; + if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&am, top, powerbuf, 1, numPowers)) goto err; /* If the window size is greater than 1, then calculate * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) @@ -662,62 +777,54 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, */ if (window > 1) { - for (i=2; i= 0) + bits--; + for (wvalue=0, i=bits%window; i>=0; i--,bits--) + wvalue = (wvalue<<1)+BN_is_bit_set(p,bits); + if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&tmp,top,powerbuf,wvalue,numPowers)) goto err; + + /* Scan the exponent one window at a time starting from the most + * significant bits. + */ + while (bits >= 0) { wvalue=0; /* The 'value' of the window */ /* Scan the window, squaring the result as we go */ - for (i=0; i> 4 & 0xF] << 8 | SQR_tb[(w) & 0xF] #endif +#if !defined(OPENSSL_BN_ASM_GF2m) /* Product of two polynomials a, b each with degree < BN_BITS2 - 1, * result is a polynomial r with degree < 2 * BN_BITS - 1 * The caller MUST ensure that the variables have the right amount @@ -216,7 +219,9 @@ static void bn_GF2m_mul_2x2(BN_ULONG *r, const BN_ULONG a1, const BN_ULONG a0, c r[2] ^= m1 ^ r[1] ^ r[3]; /* h0 ^= m1 ^ l1 ^ h1; */ r[1] = r[3] ^ r[2] ^ r[0] ^ m1 ^ m0; /* l1 ^= l0 ^ h0 ^ m0; */ } - +#else +void bn_GF2m_mul_2x2(BN_ULONG *r, BN_ULONG a1, BN_ULONG a0, BN_ULONG b1, BN_ULONG b0); +#endif /* Add polynomials a and b and store result in r; r could be a or b, a and b * could be equal; r is the bitwise XOR of a and b. @@ -360,21 +365,17 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[]) int BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p) { int ret = 0; - const int max = BN_num_bits(p) + 1; - int *arr=NULL; + int arr[6]; bn_check_top(a); bn_check_top(p); - if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err; - ret = BN_GF2m_poly2arr(p, arr, max); - if (!ret || ret > max) + ret = BN_GF2m_poly2arr(p, arr, sizeof(arr)/sizeof(arr[0])); + if (!ret || ret > (int)(sizeof(arr)/sizeof(arr[0]))) { BNerr(BN_F_BN_GF2M_MOD,BN_R_INVALID_LENGTH); - goto err; + return 0; } ret = BN_GF2m_mod_arr(r, a, arr); bn_check_top(r); -err: - if (arr) OPENSSL_free(arr); return ret; } @@ -521,7 +522,7 @@ err: */ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) { - BIGNUM *b, *c, *u, *v, *tmp; + BIGNUM *b, *c = NULL, *u = NULL, *v = NULL, *tmp; int ret = 0; bn_check_top(a); @@ -529,18 +530,18 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) BN_CTX_start(ctx); - b = BN_CTX_get(ctx); - c = BN_CTX_get(ctx); - u = BN_CTX_get(ctx); - v = BN_CTX_get(ctx); - if (v == NULL) goto err; + if ((b = BN_CTX_get(ctx))==NULL) goto err; + if ((c = BN_CTX_get(ctx))==NULL) goto err; + if ((u = BN_CTX_get(ctx))==NULL) goto err; + if ((v = BN_CTX_get(ctx))==NULL) goto err; - if (!BN_one(b)) goto err; if (!BN_GF2m_mod(u, a, p)) goto err; - if (!BN_copy(v, p)) goto err; - if (BN_is_zero(u)) goto err; + if (!BN_copy(v, p)) goto err; +#if 0 + if (!BN_one(b)) goto err; + while (1) { while (!BN_is_odd(u)) @@ -565,13 +566,89 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) if (!BN_GF2m_add(u, u, v)) goto err; if (!BN_GF2m_add(b, b, c)) goto err; } +#else + { + int i, ubits = BN_num_bits(u), + vbits = BN_num_bits(v), /* v is copy of p */ + top = p->top; + BN_ULONG *udp,*bdp,*vdp,*cdp; + + bn_wexpand(u,top); udp = u->d; + for (i=u->top;itop = top; + bn_wexpand(b,top); bdp = b->d; + bdp[0] = 1; + for (i=1;itop = top; + bn_wexpand(c,top); cdp = c->d; + for (i=0;itop = top; + vdp = v->d; /* It pays off to "cache" *->d pointers, because + * it allows optimizer to be more aggressive. + * But we don't have to "cache" p->d, because *p + * is declared 'const'... */ + while (1) + { + while (ubits && !(udp[0]&1)) + { + BN_ULONG u0,u1,b0,b1,mask; + + u0 = udp[0]; + b0 = bdp[0]; + mask = (BN_ULONG)0-(b0&1); + b0 ^= p->d[0]&mask; + for (i=0;i>1)|(u1<<(BN_BITS2-1)))&BN_MASK2; + u0 = u1; + b1 = bdp[i+1]^(p->d[i+1]&mask); + bdp[i] = ((b0>>1)|(b1<<(BN_BITS2-1)))&BN_MASK2; + b0 = b1; + } + udp[i] = u0>>1; + bdp[i] = b0>>1; + ubits--; + } + if (ubits<=BN_BITS2 && udp[0]==1) break; + + if (ubitsd; + bdp = cdp; cdp = c->d; + } + for(i=0;i # define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b)) -# elif defined(__GNUC__) +# elif defined(__GNUC__) && __GNUC__>=2 # define BN_UMULT_HIGH(a,b) ({ \ register BN_ULONG ret; \ asm ("umulh %1,%2,%0" \ @@ -247,7 +247,7 @@ extern "C" { ret; }) # endif /* compiler */ # elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG) -# if defined(__GNUC__) +# if defined(__GNUC__) && __GNUC__>=2 # define BN_UMULT_HIGH(a,b) ({ \ register BN_ULONG ret; \ asm ("mulhdu %0,%1,%2" \ @@ -257,7 +257,7 @@ extern "C" { # endif /* compiler */ # elif (defined(__x86_64) || defined(__x86_64__)) && \ (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT)) -# if defined(__GNUC__) +# if defined(__GNUC__) && __GNUC__>=2 # define BN_UMULT_HIGH(a,b) ({ \ register BN_ULONG ret,discard; \ asm ("mulq %3" \ @@ -280,6 +280,19 @@ extern "C" { # define BN_UMULT_HIGH(a,b) __umulh((a),(b)) # define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high))) # endif +# elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG)) +# if defined(__GNUC__) && __GNUC__>=2 +# define BN_UMULT_HIGH(a,b) ({ \ + register BN_ULONG ret; \ + asm ("dmultu %1,%2" \ + : "=h"(ret) \ + : "r"(a), "r"(b) : "l"); \ + ret; }) +# define BN_UMULT_LOHI(low,high,a,b) \ + asm ("dmultu %2,%3" \ + : "=l"(low),"=h"(high) \ + : "r"(a), "r"(b)); +# endif # endif /* cpu */ #endif /* OPENSSL_NO_ASM */ @@ -459,6 +472,10 @@ extern "C" { } #endif /* !BN_LLONG */ +#if defined(OPENSSL_DOING_MAKEDEPEND) && defined(OPENSSL_FIPS) +#undef bn_div_words +#endif + void bn_mul_normal(BN_ULONG *r,BN_ULONG *a,int na,BN_ULONG *b,int nb); void bn_mul_comba8(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b); void bn_mul_comba4(BN_ULONG *r,BN_ULONG *a,BN_ULONG *b); diff --git a/src/lib/libcrypto/bn/bn_lib.c b/src/lib/libcrypto/bn/bn_lib.c index 5470fbe6ef..7a5676de69 100644 --- a/src/lib/libcrypto/bn/bn_lib.c +++ b/src/lib/libcrypto/bn/bn_lib.c @@ -139,25 +139,6 @@ const BIGNUM *BN_value_one(void) return(&const_one); } -char *BN_options(void) - { - static int init=0; - static char data[16]; - - if (!init) - { - init++; -#ifdef BN_LLONG - BIO_snprintf(data,sizeof data,"bn(%d,%d)", - (int)sizeof(BN_ULLONG)*8,(int)sizeof(BN_ULONG)*8); -#else - BIO_snprintf(data,sizeof data,"bn(%d,%d)", - (int)sizeof(BN_ULONG)*8,(int)sizeof(BN_ULONG)*8); -#endif - } - return(data); - } - int BN_num_bits_word(BN_ULONG l) { static const unsigned char bits[256]={ diff --git a/src/lib/libcrypto/bn/bn_mont.c b/src/lib/libcrypto/bn/bn_mont.c index 1a866880f5..427b5cf4df 100644 --- a/src/lib/libcrypto/bn/bn_mont.c +++ b/src/lib/libcrypto/bn/bn_mont.c @@ -177,31 +177,26 @@ err: static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont) { BIGNUM *n; - BN_ULONG *ap,*np,*rp,n0,v,*nrp; - int al,nl,max,i,x,ri; + BN_ULONG *ap,*np,*rp,n0,v,carry; + int nl,max,i; n= &(mont->N); - /* mont->ri is the size of mont->N in bits (rounded up - to the word size) */ - al=ri=mont->ri/BN_BITS2; - nl=n->top; - if ((al == 0) || (nl == 0)) { ret->top=0; return(1); } + if (nl == 0) { ret->top=0; return(1); } - max=(nl+al+1); /* allow for overflow (no?) XXX */ + max=(2*nl); /* carry is stored separately */ if (bn_wexpand(r,max) == NULL) return(0); r->neg^=n->neg; np=n->d; rp=r->d; - nrp= &(r->d[nl]); /* clear the top words of T */ #if 1 for (i=r->top; id[i]=0; + rp[i]=0; #else - memset(&(r->d[r->top]),0,(max-r->top)*sizeof(BN_ULONG)); + memset(&(rp[r->top]),0,(max-r->top)*sizeof(BN_ULONG)); #endif r->top=max; @@ -210,7 +205,7 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont) #ifdef BN_COUNT fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl); #endif - for (i=0; i= v) - continue; - else - { - if (((++nrp[0])&BN_MASK2) != 0) continue; - if (((++nrp[1])&BN_MASK2) != 0) continue; - for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ; - } - } - bn_correct_top(r); - - /* mont->ri will be a multiple of the word size and below code - * is kind of BN_rshift(ret,r,mont->ri) equivalent */ - if (r->top <= ri) - { - ret->top=0; - return(1); + v = (v+carry+rp[nl])&BN_MASK2; + carry |= (v != rp[nl]); + carry &= (v <= rp[nl]); + rp[nl]=v; } - al=r->top-ri; -#define BRANCH_FREE 1 -#if BRANCH_FREE - if (bn_wexpand(ret,ri) == NULL) return(0); - x=0-(((al-ri)>>(sizeof(al)*8-1))&1); - ret->top=x=(ri&~x)|(al&x); /* min(ri,al) */ + if (bn_wexpand(ret,nl) == NULL) return(0); + ret->top=nl; ret->neg=r->neg; rp=ret->d; - ap=&(r->d[ri]); + ap=&(r->d[nl]); +#define BRANCH_FREE 1 +#if BRANCH_FREE { - size_t m1,m2; - - v=bn_sub_words(rp,ap,np,ri); - /* this ----------------^^ works even in alri) nrp=rp; else nrp=ap; */ - /* in other words if subtraction result is real, then + v=bn_sub_words(rp,ap,np,nl)-carry; + /* if subtraction result is real, then * trick unconditional memcpy below to perform in-place * "refresh" instead of actual copy. */ - m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1); /* al>(sizeof(al)*8-1))&1); /* al>ri */ - m1|=m2; /* (al!=ri) */ - m1|=(0-(size_t)v); /* (al!=ri || v) */ - m1&=~m2; /* (al!=ri || v) && !al>ri */ - nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m1)|((PTR_SIZE_INT)ap&m1)); - } + m=(0-(size_t)v); + nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m)|((PTR_SIZE_INT)ap&m)); - /* 'itop=al; - ret->neg=r->neg; - - rp=ret->d; - ap=&(r->d[ri]); - al-=4; - for (i=0; iN)) >= 0) - { - if (!BN_usub(ret,ret,&(mont->N))) return(0); - } + if (bn_sub_words (rp,ap,np,nl)-carry) + memcpy(rp,ap,nl*sizeof(BN_ULONG)); #endif + bn_correct_top(r); + bn_correct_top(ret); bn_check_top(ret); return(1); diff --git a/src/lib/libcrypto/bn/bn_nist.c b/src/lib/libcrypto/bn/bn_nist.c index c6de032696..43caee4770 100644 --- a/src/lib/libcrypto/bn/bn_nist.c +++ b/src/lib/libcrypto/bn/bn_nist.c @@ -319,6 +319,13 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top) :(to[(n)/2] =((m)&1)?(from[(m)/2]>>32):(from[(m)/2]&BN_MASK2l))) #define bn_32_set_0(to, n) (((n)&1)?(to[(n)/2]&=BN_MASK2l):(to[(n)/2]=0)); #define bn_cp_32(to,n,from,m) ((m)>=0)?bn_cp_32_naked(to,n,from,m):bn_32_set_0(to,n) +# if defined(L_ENDIAN) +# if defined(__arch64__) +# define NIST_INT64 long +# else +# define NIST_INT64 long long +# endif +# endif #else #define bn_cp_64(to, n, from, m) \ { \ @@ -330,13 +337,15 @@ static void nist_cp_bn(BN_ULONG *buf, BN_ULONG *a, int top) bn_32_set_0(to, (n)*2); \ bn_32_set_0(to, (n)*2+1); \ } -#if BN_BITS2 == 32 #define bn_cp_32(to, n, from, m) (to)[n] = (m>=0)?((from)[m]):0; #define bn_32_set_0(to, n) (to)[n] = (BN_ULONG)0; -#endif +# if defined(_WIN32) && !defined(__GNUC__) +# define NIST_INT64 __int64 +# elif defined(BN_LLONG) +# define NIST_INT64 long long +# endif #endif /* BN_BITS2 != 64 */ - #define nist_set_192(to, from, a1, a2, a3) \ { \ bn_cp_64(to, 0, from, (a3) - 3) \ @@ -350,9 +359,11 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, int top = a->top, i; int carry; register BN_ULONG *r_d, *a_d = a->d; - BN_ULONG t_d[BN_NIST_192_TOP], - buf[BN_NIST_192_TOP], - c_d[BN_NIST_192_TOP], + union { + BN_ULONG bn[BN_NIST_192_TOP]; + unsigned int ui[BN_NIST_192_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)]; + } buf; + BN_ULONG c_d[BN_NIST_192_TOP], *res; PTR_SIZE_INT mask; static const BIGNUM _bignum_nist_p_192_sqr = { @@ -385,15 +396,48 @@ int BN_nist_mod_192(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, else r_d = a_d; - nist_cp_bn_0(buf, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP); + nist_cp_bn_0(buf.bn, a_d + BN_NIST_192_TOP, top - BN_NIST_192_TOP, BN_NIST_192_TOP); + +#if defined(NIST_INT64) + { + NIST_INT64 acc; /* accumulator */ + unsigned int *rp=(unsigned int *)r_d; + const unsigned int *bp=(const unsigned int *)buf.ui; + + acc = rp[0]; acc += bp[3*2-6]; + acc += bp[5*2-6]; rp[0] = (unsigned int)acc; acc >>= 32; + + acc += rp[1]; acc += bp[3*2-5]; + acc += bp[5*2-5]; rp[1] = (unsigned int)acc; acc >>= 32; - nist_set_192(t_d, buf, 0, 3, 3); + acc += rp[2]; acc += bp[3*2-6]; + acc += bp[4*2-6]; + acc += bp[5*2-6]; rp[2] = (unsigned int)acc; acc >>= 32; + + acc += rp[3]; acc += bp[3*2-5]; + acc += bp[4*2-5]; + acc += bp[5*2-5]; rp[3] = (unsigned int)acc; acc >>= 32; + + acc += rp[4]; acc += bp[4*2-6]; + acc += bp[5*2-6]; rp[4] = (unsigned int)acc; acc >>= 32; + + acc += rp[5]; acc += bp[4*2-5]; + acc += bp[5*2-5]; rp[5] = (unsigned int)acc; + + carry = (int)(acc>>32); + } +#else + { + BN_ULONG t_d[BN_NIST_192_TOP]; + + nist_set_192(t_d, buf.bn, 0, 3, 3); carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP); - nist_set_192(t_d, buf, 4, 4, 0); + nist_set_192(t_d, buf.bn, 4, 4, 0); carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP); - nist_set_192(t_d, buf, 5, 5, 5) + nist_set_192(t_d, buf.bn, 5, 5, 5) carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_192_TOP); - + } +#endif if (carry > 0) carry = (int)bn_sub_words(r_d,r_d,_nist_p_192[carry-1],BN_NIST_192_TOP); else @@ -435,8 +479,7 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, int top = a->top, i; int carry; BN_ULONG *r_d, *a_d = a->d; - BN_ULONG t_d[BN_NIST_224_TOP], - buf[BN_NIST_224_TOP], + BN_ULONG buf[BN_NIST_224_TOP], c_d[BN_NIST_224_TOP], *res; PTR_SIZE_INT mask; @@ -474,14 +517,54 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, #if BN_BITS2==64 /* copy upper 256 bits of 448 bit number ... */ - nist_cp_bn_0(t_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP); + nist_cp_bn_0(c_d, a_d + (BN_NIST_224_TOP-1), top - (BN_NIST_224_TOP-1), BN_NIST_224_TOP); /* ... and right shift by 32 to obtain upper 224 bits */ - nist_set_224(buf, t_d, 14, 13, 12, 11, 10, 9, 8); + nist_set_224(buf, c_d, 14, 13, 12, 11, 10, 9, 8); /* truncate lower part to 224 bits too */ r_d[BN_NIST_224_TOP-1] &= BN_MASK2l; #else nist_cp_bn_0(buf, a_d + BN_NIST_224_TOP, top - BN_NIST_224_TOP, BN_NIST_224_TOP); #endif + +#if defined(NIST_INT64) && BN_BITS2!=64 + { + NIST_INT64 acc; /* accumulator */ + unsigned int *rp=(unsigned int *)r_d; + const unsigned int *bp=(const unsigned int *)buf; + + acc = rp[0]; acc -= bp[7-7]; + acc -= bp[11-7]; rp[0] = (unsigned int)acc; acc >>= 32; + + acc += rp[1]; acc -= bp[8-7]; + acc -= bp[12-7]; rp[1] = (unsigned int)acc; acc >>= 32; + + acc += rp[2]; acc -= bp[9-7]; + acc -= bp[13-7]; rp[2] = (unsigned int)acc; acc >>= 32; + + acc += rp[3]; acc += bp[7-7]; + acc += bp[11-7]; + acc -= bp[10-7]; rp[3] = (unsigned int)acc; acc>>= 32; + + acc += rp[4]; acc += bp[8-7]; + acc += bp[12-7]; + acc -= bp[11-7]; rp[4] = (unsigned int)acc; acc >>= 32; + + acc += rp[5]; acc += bp[9-7]; + acc += bp[13-7]; + acc -= bp[12-7]; rp[5] = (unsigned int)acc; acc >>= 32; + + acc += rp[6]; acc += bp[10-7]; + acc -= bp[13-7]; rp[6] = (unsigned int)acc; + + carry = (int)(acc>>32); +# if BN_BITS2==64 + rp[7] = carry; +# endif + } +#else + { + BN_ULONG t_d[BN_NIST_224_TOP]; + nist_set_224(t_d, buf, 10, 9, 8, 7, 0, 0, 0); carry = (int)bn_add_words(r_d, r_d, t_d, BN_NIST_224_TOP); nist_set_224(t_d, buf, 0, 13, 12, 11, 0, 0, 0); @@ -493,6 +576,8 @@ int BN_nist_mod_224(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, #if BN_BITS2==64 carry = (int)(r_d[BN_NIST_224_TOP-1]>>32); +#endif + } #endif u.f = bn_sub_words; if (carry > 0) @@ -548,9 +633,11 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, int i, top = a->top; int carry = 0; register BN_ULONG *a_d = a->d, *r_d; - BN_ULONG t_d[BN_NIST_256_TOP], - buf[BN_NIST_256_TOP], - c_d[BN_NIST_256_TOP], + union { + BN_ULONG bn[BN_NIST_256_TOP]; + unsigned int ui[BN_NIST_256_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)]; + } buf; + BN_ULONG c_d[BN_NIST_256_TOP], *res; PTR_SIZE_INT mask; union { bn_addsub_f f; PTR_SIZE_INT p; } u; @@ -584,12 +671,87 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, else r_d = a_d; - nist_cp_bn_0(buf, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP); + nist_cp_bn_0(buf.bn, a_d + BN_NIST_256_TOP, top - BN_NIST_256_TOP, BN_NIST_256_TOP); + +#if defined(NIST_INT64) + { + NIST_INT64 acc; /* accumulator */ + unsigned int *rp=(unsigned int *)r_d; + const unsigned int *bp=(const unsigned int *)buf.ui; + + acc = rp[0]; acc += bp[8-8]; + acc += bp[9-8]; + acc -= bp[11-8]; + acc -= bp[12-8]; + acc -= bp[13-8]; + acc -= bp[14-8]; rp[0] = (unsigned int)acc; acc >>= 32; + + acc += rp[1]; acc += bp[9-8]; + acc += bp[10-8]; + acc -= bp[12-8]; + acc -= bp[13-8]; + acc -= bp[14-8]; + acc -= bp[15-8]; rp[1] = (unsigned int)acc; acc >>= 32; + + acc += rp[2]; acc += bp[10-8]; + acc += bp[11-8]; + acc -= bp[13-8]; + acc -= bp[14-8]; + acc -= bp[15-8]; rp[2] = (unsigned int)acc; acc >>= 32; + + acc += rp[3]; acc += bp[11-8]; + acc += bp[11-8]; + acc += bp[12-8]; + acc += bp[12-8]; + acc += bp[13-8]; + acc -= bp[15-8]; + acc -= bp[8-8]; + acc -= bp[9-8]; rp[3] = (unsigned int)acc; acc >>= 32; + + acc += rp[4]; acc += bp[12-8]; + acc += bp[12-8]; + acc += bp[13-8]; + acc += bp[13-8]; + acc += bp[14-8]; + acc -= bp[9-8]; + acc -= bp[10-8]; rp[4] = (unsigned int)acc; acc >>= 32; + + acc += rp[5]; acc += bp[13-8]; + acc += bp[13-8]; + acc += bp[14-8]; + acc += bp[14-8]; + acc += bp[15-8]; + acc -= bp[10-8]; + acc -= bp[11-8]; rp[5] = (unsigned int)acc; acc >>= 32; + + acc += rp[6]; acc += bp[14-8]; + acc += bp[14-8]; + acc += bp[15-8]; + acc += bp[15-8]; + acc += bp[14-8]; + acc += bp[13-8]; + acc -= bp[8-8]; + acc -= bp[9-8]; rp[6] = (unsigned int)acc; acc >>= 32; + + acc += rp[7]; acc += bp[15-8]; + acc += bp[15-8]; + acc += bp[15-8]; + acc += bp[8 -8]; + acc -= bp[10-8]; + acc -= bp[11-8]; + acc -= bp[12-8]; + acc -= bp[13-8]; rp[7] = (unsigned int)acc; + + carry = (int)(acc>>32); + } +#else + { + BN_ULONG t_d[BN_NIST_256_TOP]; /*S1*/ - nist_set_256(t_d, buf, 15, 14, 13, 12, 11, 0, 0, 0); + nist_set_256(t_d, buf.bn, 15, 14, 13, 12, 11, 0, 0, 0); /*S2*/ - nist_set_256(c_d, buf, 0, 15, 14, 13, 12, 0, 0, 0); + nist_set_256(c_d, buf.bn, 0, 15, 14, 13, 12, 0, 0, 0); carry = (int)bn_add_words(t_d, t_d, c_d, BN_NIST_256_TOP); /* left shift */ { @@ -607,24 +769,26 @@ int BN_nist_mod_256(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, } carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP); /*S3*/ - nist_set_256(t_d, buf, 15, 14, 0, 0, 0, 10, 9, 8); + nist_set_256(t_d, buf.bn, 15, 14, 0, 0, 0, 10, 9, 8); carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP); /*S4*/ - nist_set_256(t_d, buf, 8, 13, 15, 14, 13, 11, 10, 9); + nist_set_256(t_d, buf.bn, 8, 13, 15, 14, 13, 11, 10, 9); carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_256_TOP); /*D1*/ - nist_set_256(t_d, buf, 10, 8, 0, 0, 0, 13, 12, 11); + nist_set_256(t_d, buf.bn, 10, 8, 0, 0, 0, 13, 12, 11); carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP); /*D2*/ - nist_set_256(t_d, buf, 11, 9, 0, 0, 15, 14, 13, 12); + nist_set_256(t_d, buf.bn, 11, 9, 0, 0, 15, 14, 13, 12); carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP); /*D3*/ - nist_set_256(t_d, buf, 12, 0, 10, 9, 8, 15, 14, 13); + nist_set_256(t_d, buf.bn, 12, 0, 10, 9, 8, 15, 14, 13); carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP); /*D4*/ - nist_set_256(t_d, buf, 13, 0, 11, 10, 9, 0, 15, 14); + nist_set_256(t_d, buf.bn, 13, 0, 11, 10, 9, 0, 15, 14); carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_256_TOP); + } +#endif /* see BN_nist_mod_224 for explanation */ u.f = bn_sub_words; if (carry > 0) @@ -672,9 +836,11 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, int i, top = a->top; int carry = 0; register BN_ULONG *r_d, *a_d = a->d; - BN_ULONG t_d[BN_NIST_384_TOP], - buf[BN_NIST_384_TOP], - c_d[BN_NIST_384_TOP], + union { + BN_ULONG bn[BN_NIST_384_TOP]; + unsigned int ui[BN_NIST_384_TOP*sizeof(BN_ULONG)/sizeof(unsigned int)]; + } buf; + BN_ULONG c_d[BN_NIST_384_TOP], *res; PTR_SIZE_INT mask; union { bn_addsub_f f; PTR_SIZE_INT p; } u; @@ -709,10 +875,100 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, else r_d = a_d; - nist_cp_bn_0(buf, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP); + nist_cp_bn_0(buf.bn, a_d + BN_NIST_384_TOP, top - BN_NIST_384_TOP, BN_NIST_384_TOP); + +#if defined(NIST_INT64) + { + NIST_INT64 acc; /* accumulator */ + unsigned int *rp=(unsigned int *)r_d; + const unsigned int *bp=(const unsigned int *)buf.ui; + + acc = rp[0]; acc += bp[12-12]; + acc += bp[21-12]; + acc += bp[20-12]; + acc -= bp[23-12]; rp[0] = (unsigned int)acc; acc >>= 32; + + acc += rp[1]; acc += bp[13-12]; + acc += bp[22-12]; + acc += bp[23-12]; + acc -= bp[12-12]; + acc -= bp[20-12]; rp[1] = (unsigned int)acc; acc >>= 32; + + acc += rp[2]; acc += bp[14-12]; + acc += bp[23-12]; + acc -= bp[13-12]; + acc -= bp[21-12]; rp[2] = (unsigned int)acc; acc >>= 32; + + acc += rp[3]; acc += bp[15-12]; + acc += bp[12-12]; + acc += bp[20-12]; + acc += bp[21-12]; + acc -= bp[14-12]; + acc -= bp[22-12]; + acc -= bp[23-12]; rp[3] = (unsigned int)acc; acc >>= 32; + + acc += rp[4]; acc += bp[21-12]; + acc += bp[21-12]; + acc += bp[16-12]; + acc += bp[13-12]; + acc += bp[12-12]; + acc += bp[20-12]; + acc += bp[22-12]; + acc -= bp[15-12]; + acc -= bp[23-12]; + acc -= bp[23-12]; rp[4] = (unsigned int)acc; acc >>= 32; + + acc += rp[5]; acc += bp[22-12]; + acc += bp[22-12]; + acc += bp[17-12]; + acc += bp[14-12]; + acc += bp[13-12]; + acc += bp[21-12]; + acc += bp[23-12]; + acc -= bp[16-12]; rp[5] = (unsigned int)acc; acc >>= 32; + + acc += rp[6]; acc += bp[23-12]; + acc += bp[23-12]; + acc += bp[18-12]; + acc += bp[15-12]; + acc += bp[14-12]; + acc += bp[22-12]; + acc -= bp[17-12]; rp[6] = (unsigned int)acc; acc >>= 32; + + acc += rp[7]; acc += bp[19-12]; + acc += bp[16-12]; + acc += bp[15-12]; + acc += bp[23-12]; + acc -= bp[18-12]; rp[7] = (unsigned int)acc; acc >>= 32; + + acc += rp[8]; acc += bp[20-12]; + acc += bp[17-12]; + acc += bp[16-12]; + acc -= bp[19-12]; rp[8] = (unsigned int)acc; acc >>= 32; + + acc += rp[9]; acc += bp[21-12]; + acc += bp[18-12]; + acc += bp[17-12]; + acc -= bp[20-12]; rp[9] = (unsigned int)acc; acc >>= 32; + + acc += rp[10]; acc += bp[22-12]; + acc += bp[19-12]; + acc += bp[18-12]; + acc -= bp[21-12]; rp[10] = (unsigned int)acc; acc >>= 32; + + acc += rp[11]; acc += bp[23-12]; + acc += bp[20-12]; + acc += bp[19-12]; + acc -= bp[22-12]; rp[11] = (unsigned int)acc; + + carry = (int)(acc>>32); + } +#else + { + BN_ULONG t_d[BN_NIST_384_TOP]; /*S1*/ - nist_set_256(t_d, buf, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4); + nist_set_256(t_d, buf.bn, 0, 0, 0, 0, 0, 23-4, 22-4, 21-4); /* left shift */ { register BN_ULONG *ap,t,c; @@ -729,29 +985,31 @@ int BN_nist_mod_384(BIGNUM *r, const BIGNUM *a, const BIGNUM *field, carry = (int)bn_add_words(r_d+(128/BN_BITS2), r_d+(128/BN_BITS2), t_d, BN_NIST_256_TOP); /*S2 */ - carry += (int)bn_add_words(r_d, r_d, buf, BN_NIST_384_TOP); + carry += (int)bn_add_words(r_d, r_d, buf.bn, BN_NIST_384_TOP); /*S3*/ - nist_set_384(t_d,buf,20,19,18,17,16,15,14,13,12,23,22,21); + nist_set_384(t_d,buf.bn,20,19,18,17,16,15,14,13,12,23,22,21); carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP); /*S4*/ - nist_set_384(t_d,buf,19,18,17,16,15,14,13,12,20,0,23,0); + nist_set_384(t_d,buf.bn,19,18,17,16,15,14,13,12,20,0,23,0); carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP); /*S5*/ - nist_set_384(t_d, buf,0,0,0,0,23,22,21,20,0,0,0,0); + nist_set_384(t_d, buf.bn,0,0,0,0,23,22,21,20,0,0,0,0); carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP); /*S6*/ - nist_set_384(t_d,buf,0,0,0,0,0,0,23,22,21,0,0,20); + nist_set_384(t_d,buf.bn,0,0,0,0,0,0,23,22,21,0,0,20); carry += (int)bn_add_words(r_d, r_d, t_d, BN_NIST_384_TOP); /*D1*/ - nist_set_384(t_d,buf,22,21,20,19,18,17,16,15,14,13,12,23); + nist_set_384(t_d,buf.bn,22,21,20,19,18,17,16,15,14,13,12,23); carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP); /*D2*/ - nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,22,21,20,0); + nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,22,21,20,0); carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP); /*D3*/ - nist_set_384(t_d,buf,0,0,0,0,0,0,0,23,23,0,0,0); + nist_set_384(t_d,buf.bn,0,0,0,0,0,0,0,23,23,0,0,0); carry -= (int)bn_sub_words(r_d, r_d, t_d, BN_NIST_384_TOP); + } +#endif /* see BN_nist_mod_224 for explanation */ u.f = bn_sub_words; if (carry > 0) diff --git a/src/lib/libcrypto/bn/bn_print.c b/src/lib/libcrypto/bn/bn_print.c index bebb466d08..1743b6a7e2 100644 --- a/src/lib/libcrypto/bn/bn_print.c +++ b/src/lib/libcrypto/bn/bn_print.c @@ -357,3 +357,22 @@ end: return(ret); } #endif + +char *BN_options(void) + { + static int init=0; + static char data[16]; + + if (!init) + { + init++; +#ifdef BN_LLONG + BIO_snprintf(data,sizeof data,"bn(%d,%d)", + (int)sizeof(BN_ULLONG)*8,(int)sizeof(BN_ULONG)*8); +#else + BIO_snprintf(data,sizeof data,"bn(%d,%d)", + (int)sizeof(BN_ULONG)*8,(int)sizeof(BN_ULONG)*8); +#endif + } + return(data); + } diff --git a/src/lib/libcrypto/bn/bn_shift.c b/src/lib/libcrypto/bn/bn_shift.c index c4d301afc4..a6fca2c424 100644 --- a/src/lib/libcrypto/bn/bn_shift.c +++ b/src/lib/libcrypto/bn/bn_shift.c @@ -99,7 +99,7 @@ int BN_lshift1(BIGNUM *r, const BIGNUM *a) int BN_rshift1(BIGNUM *r, const BIGNUM *a) { BN_ULONG *ap,*rp,t,c; - int i; + int i,j; bn_check_top(r); bn_check_top(a); @@ -109,22 +109,25 @@ int BN_rshift1(BIGNUM *r, const BIGNUM *a) BN_zero(r); return(1); } + i = a->top; + ap= a->d; + j = i-(ap[i-1]==1); if (a != r) { - if (bn_wexpand(r,a->top) == NULL) return(0); - r->top=a->top; + if (bn_wexpand(r,j) == NULL) return(0); r->neg=a->neg; } - ap=a->d; rp=r->d; - c=0; - for (i=a->top-1; i>=0; i--) + t=ap[--i]; + c=(t&1)?BN_TBIT:0; + if (t>>=1) rp[i]=t; + while (i>0) { - t=ap[i]; + t=ap[--i]; rp[i]=((t>>1)&BN_MASK2)|c; c=(t&1)?BN_TBIT:0; } - bn_correct_top(r); + r->top=j; bn_check_top(r); return(1); } @@ -182,10 +185,11 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n) BN_zero(r); return(1); } + i = (BN_num_bits(a)-n+(BN_BITS2-1))/BN_BITS2; if (r != a) { r->neg=a->neg; - if (bn_wexpand(r,a->top-nw+1) == NULL) return(0); + if (bn_wexpand(r,i) == NULL) return(0); } else { @@ -196,7 +200,7 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n) f= &(a->d[nw]); t=r->d; j=a->top-nw; - r->top=j; + r->top=i; if (rb == 0) { @@ -212,9 +216,8 @@ int BN_rshift(BIGNUM *r, const BIGNUM *a, int n) l= *(f++); *(t++) =(tmp|(l<>rb)&BN_MASK2; + if ((l = (l>>rb)&BN_MASK2)) *(t) = l; } - bn_correct_top(r); bn_check_top(r); return(1); } diff --git a/src/lib/libcrypto/buffer/buf_str.c b/src/lib/libcrypto/buffer/buf_str.c index 28dd1e401e..151f5ea971 100644 --- a/src/lib/libcrypto/buffer/buf_str.c +++ b/src/lib/libcrypto/buffer/buf_str.c @@ -1,56 +1,59 @@ -/* crypto/buffer/buf_str.c */ -/* ==================================================================== - * Copyright (c) 2007 The OpenSSL Project. All rights reserved. +/* crypto/buffer/buffer.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] */ #include diff --git a/src/lib/libcrypto/buffer/buffer.c b/src/lib/libcrypto/buffer/buffer.c index 620ea8d536..d7aa79ad7f 100644 --- a/src/lib/libcrypto/buffer/buffer.c +++ b/src/lib/libcrypto/buffer/buffer.c @@ -60,6 +60,11 @@ #include "cryptlib.h" #include +/* LIMIT_BEFORE_EXPANSION is the maximum n such that (n+3)/3*4 < 2**31. That + * function is applied in several functions in this file and this limit ensures + * that the result fits in an int. */ +#define LIMIT_BEFORE_EXPANSION 0x5ffffffc + BUF_MEM *BUF_MEM_new(void) { BUF_MEM *ret; @@ -105,6 +110,12 @@ int BUF_MEM_grow(BUF_MEM *str, size_t len) str->length=len; return(len); } + /* This limit is sufficient to ensure (len+3)/3*4 < 2**31 */ + if (len > LIMIT_BEFORE_EXPANSION) + { + BUFerr(BUF_F_BUF_MEM_GROW,ERR_R_MALLOC_FAILURE); + return 0; + } n=(len+3)/3*4; if (str->data == NULL) ret=OPENSSL_malloc(n); @@ -142,6 +153,12 @@ int BUF_MEM_grow_clean(BUF_MEM *str, size_t len) str->length=len; return(len); } + /* This limit is sufficient to ensure (len+3)/3*4 < 2**31 */ + if (len > LIMIT_BEFORE_EXPANSION) + { + BUFerr(BUF_F_BUF_MEM_GROW_CLEAN,ERR_R_MALLOC_FAILURE); + return 0; + } n=(len+3)/3*4; if (str->data == NULL) ret=OPENSSL_malloc(n); @@ -162,64 +179,6 @@ int BUF_MEM_grow_clean(BUF_MEM *str, size_t len) return(len); } -char *BUF_strdup(const char *str) - { - if (str == NULL) return(NULL); - return BUF_strndup(str, strlen(str)); - } - -char *BUF_strndup(const char *str, size_t siz) - { - char *ret; - - if (str == NULL) return(NULL); - - ret=OPENSSL_malloc(siz+1); - if (ret == NULL) - { - BUFerr(BUF_F_BUF_STRNDUP,ERR_R_MALLOC_FAILURE); - return(NULL); - } - BUF_strlcpy(ret,str,siz+1); - return(ret); - } - -void *BUF_memdup(const void *data, size_t siz) - { - void *ret; - - if (data == NULL) return(NULL); - - ret=OPENSSL_malloc(siz); - if (ret == NULL) - { - BUFerr(BUF_F_BUF_MEMDUP,ERR_R_MALLOC_FAILURE); - return(NULL); - } - return memcpy(ret, data, siz); - } - -size_t BUF_strlcpy(char *dst, const char *src, size_t size) - { - size_t l = 0; - for(; size > 1 && *src; size--) - { - *dst++ = *src++; - l++; - } - if (size) - *dst = '\0'; - return l + strlen(src); - } - -size_t BUF_strlcat(char *dst, const char *src, size_t size) - { - size_t l = 0; - for(; size > 0 && *dst; size--, dst++) - l++; - return l + BUF_strlcpy(dst, src, size); - } - void BUF_reverse(unsigned char *out, unsigned char *in, size_t size) { size_t i; diff --git a/src/lib/libcrypto/camellia/asm/cmll-x86.pl b/src/lib/libcrypto/camellia/asm/cmll-x86.pl index 027302ac86..c314d62312 100644 --- a/src/lib/libcrypto/camellia/asm/cmll-x86.pl +++ b/src/lib/libcrypto/camellia/asm/cmll-x86.pl @@ -723,11 +723,11 @@ my $bias=int(@T[0])?shift(@T):0; &function_end("Camellia_Ekeygen"); if ($OPENSSL) { -# int Camellia_set_key ( +# int private_Camellia_set_key ( # const unsigned char *userKey, # int bits, # CAMELLIA_KEY *key) -&function_begin_B("Camellia_set_key"); +&function_begin_B("private_Camellia_set_key"); &push ("ebx"); &mov ("ecx",&wparam(0)); # pull arguments &mov ("ebx",&wparam(1)); @@ -760,7 +760,7 @@ if ($OPENSSL) { &set_label("done",4); &pop ("ebx"); &ret (); -&function_end_B("Camellia_set_key"); +&function_end_B("private_Camellia_set_key"); } @SBOX=( diff --git a/src/lib/libcrypto/camellia/camellia.h b/src/lib/libcrypto/camellia/camellia.h index cf0457dd97..67911e0adf 100644 --- a/src/lib/libcrypto/camellia/camellia.h +++ b/src/lib/libcrypto/camellia/camellia.h @@ -88,6 +88,10 @@ struct camellia_key_st }; typedef struct camellia_key_st CAMELLIA_KEY; +#ifdef OPENSSL_FIPS +int private_Camellia_set_key(const unsigned char *userKey, const int bits, + CAMELLIA_KEY *key); +#endif int Camellia_set_key(const unsigned char *userKey, const int bits, CAMELLIA_KEY *key); diff --git a/src/lib/libcrypto/camellia/cmll_locl.h b/src/lib/libcrypto/camellia/cmll_locl.h index 4a4d880d16..246b6ce1d8 100644 --- a/src/lib/libcrypto/camellia/cmll_locl.h +++ b/src/lib/libcrypto/camellia/cmll_locl.h @@ -71,7 +71,8 @@ typedef unsigned int u32; typedef unsigned char u8; -int Camellia_Ekeygen(int keyBitLength, const u8 *rawKey, KEY_TABLE_TYPE keyTable); +int Camellia_Ekeygen(int keyBitLength, const u8 *rawKey, + KEY_TABLE_TYPE keyTable); void Camellia_EncryptBlock_Rounds(int grandRounds, const u8 plaintext[], const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); void Camellia_DecryptBlock_Rounds(int grandRounds, const u8 ciphertext[], @@ -80,4 +81,6 @@ void Camellia_EncryptBlock(int keyBitLength, const u8 plaintext[], const KEY_TABLE_TYPE keyTable, u8 ciphertext[]); void Camellia_DecryptBlock(int keyBitLength, const u8 ciphertext[], const KEY_TABLE_TYPE keyTable, u8 plaintext[]); +int private_Camellia_set_key(const unsigned char *userKey, const int bits, + CAMELLIA_KEY *key); #endif /* #ifndef HEADER_CAMELLIA_LOCL_H */ diff --git a/src/lib/libcrypto/camellia/cmll_misc.c b/src/lib/libcrypto/camellia/cmll_misc.c index f44689124b..f44d48564c 100644 --- a/src/lib/libcrypto/camellia/cmll_misc.c +++ b/src/lib/libcrypto/camellia/cmll_misc.c @@ -50,12 +50,13 @@ */ #include +#include #include #include "cmll_locl.h" const char CAMELLIA_version[]="CAMELLIA" OPENSSL_VERSION_PTEXT; -int Camellia_set_key(const unsigned char *userKey, const int bits, +int private_Camellia_set_key(const unsigned char *userKey, const int bits, CAMELLIA_KEY *key) { if(!userKey || !key) diff --git a/src/lib/libcrypto/cast/c_skey.c b/src/lib/libcrypto/cast/c_skey.c index 76e40005c9..cb6bf9fee3 100644 --- a/src/lib/libcrypto/cast/c_skey.c +++ b/src/lib/libcrypto/cast/c_skey.c @@ -56,6 +56,7 @@ * [including the GNU Public Licence.] */ +#include #include #include "cast_lcl.h" #include "cast_s.h" @@ -71,8 +72,14 @@ #define S5 CAST_S_table5 #define S6 CAST_S_table6 #define S7 CAST_S_table7 - void CAST_set_key(CAST_KEY *key, int len, const unsigned char *data) +#ifdef OPENSSL_FIPS + { + fips_cipher_abort(CAST); + private_CAST_set_key(key, len, data); + } +void private_CAST_set_key(CAST_KEY *key, int len, const unsigned char *data) +#endif { CAST_LONG x[16]; CAST_LONG z[16]; diff --git a/src/lib/libcrypto/cast/cast.h b/src/lib/libcrypto/cast/cast.h index 1a264f8143..203922ea2b 100644 --- a/src/lib/libcrypto/cast/cast.h +++ b/src/lib/libcrypto/cast/cast.h @@ -83,7 +83,9 @@ typedef struct cast_key_st int short_key; /* Use reduced rounds for short key */ } CAST_KEY; - +#ifdef OPENSSL_FIPS +void private_CAST_set_key(CAST_KEY *key, int len, const unsigned char *data); +#endif void CAST_set_key(CAST_KEY *key, int len, const unsigned char *data); void CAST_ecb_encrypt(const unsigned char *in, unsigned char *out, const CAST_KEY *key, int enc); diff --git a/src/lib/libcrypto/cmac/cm_ameth.c b/src/lib/libcrypto/cmac/cm_ameth.c new file mode 100644 index 0000000000..0b8e5670b0 --- /dev/null +++ b/src/lib/libcrypto/cmac/cm_ameth.c @@ -0,0 +1,97 @@ +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project 2010. + */ +/* ==================================================================== + * Copyright (c) 2010 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include +#include "cryptlib.h" +#include +#include +#include "asn1_locl.h" + +/* CMAC "ASN1" method. This is just here to indicate the + * maximum CMAC output length and to free up a CMAC + * key. + */ + +static int cmac_size(const EVP_PKEY *pkey) + { + return EVP_MAX_BLOCK_LENGTH; + } + +static void cmac_key_free(EVP_PKEY *pkey) + { + CMAC_CTX *cmctx = (CMAC_CTX *)pkey->pkey.ptr; + if (cmctx) + CMAC_CTX_free(cmctx); + } + +const EVP_PKEY_ASN1_METHOD cmac_asn1_meth = + { + EVP_PKEY_CMAC, + EVP_PKEY_CMAC, + 0, + + "CMAC", + "OpenSSL CMAC method", + + 0,0,0,0, + + 0,0,0, + + cmac_size, + 0, + 0,0,0,0,0,0,0, + + cmac_key_free, + 0, + 0,0 + }; + diff --git a/src/lib/libcrypto/cmac/cm_pmeth.c b/src/lib/libcrypto/cmac/cm_pmeth.c new file mode 100644 index 0000000000..072228ec7f --- /dev/null +++ b/src/lib/libcrypto/cmac/cm_pmeth.c @@ -0,0 +1,224 @@ +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project 2010. + */ +/* ==================================================================== + * Copyright (c) 2010 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include +#include "cryptlib.h" +#include +#include +#include +#include +#include "evp_locl.h" + +/* The context structure and "key" is simply a CMAC_CTX */ + +static int pkey_cmac_init(EVP_PKEY_CTX *ctx) + { + ctx->data = CMAC_CTX_new(); + if (!ctx->data) + return 0; + ctx->keygen_info_count = 0; + return 1; + } + +static int pkey_cmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) + { + if (!pkey_cmac_init(dst)) + return 0; + if (!CMAC_CTX_copy(dst->data, src->data)) + return 0; + return 1; + } + +static void pkey_cmac_cleanup(EVP_PKEY_CTX *ctx) + { + CMAC_CTX_free(ctx->data); + } + +static int pkey_cmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) + { + CMAC_CTX *cmkey = CMAC_CTX_new(); + CMAC_CTX *cmctx = ctx->data; + if (!cmkey) + return 0; + if (!CMAC_CTX_copy(cmkey, cmctx)) + { + CMAC_CTX_free(cmkey); + return 0; + } + EVP_PKEY_assign(pkey, EVP_PKEY_CMAC, cmkey); + + return 1; + } + +static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count) + { + if (!CMAC_Update(ctx->pctx->data, data, count)) + return 0; + return 1; + } + +static int cmac_signctx_init(EVP_PKEY_CTX *ctx, EVP_MD_CTX *mctx) + { + EVP_MD_CTX_set_flags(mctx, EVP_MD_CTX_FLAG_NO_INIT); + mctx->update = int_update; + return 1; + } + +static int cmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, + EVP_MD_CTX *mctx) + { + return CMAC_Final(ctx->data, sig, siglen); + } + +static int pkey_cmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) + { + CMAC_CTX *cmctx = ctx->data; + switch (type) + { + + case EVP_PKEY_CTRL_SET_MAC_KEY: + if (!p2 || p1 < 0) + return 0; + if (!CMAC_Init(cmctx, p2, p1, NULL, NULL)) + return 0; + break; + + case EVP_PKEY_CTRL_CIPHER: + if (!CMAC_Init(cmctx, NULL, 0, p2, ctx->engine)) + return 0; + break; + + case EVP_PKEY_CTRL_MD: + if (ctx->pkey && !CMAC_CTX_copy(ctx->data, + (CMAC_CTX *)ctx->pkey->pkey.ptr)) + return 0; + if (!CMAC_Init(cmctx, NULL, 0, NULL, NULL)) + return 0; + break; + + default: + return -2; + + } + return 1; + } + +static int pkey_cmac_ctrl_str(EVP_PKEY_CTX *ctx, + const char *type, const char *value) + { + if (!value) + { + return 0; + } + if (!strcmp(type, "key")) + { + void *p = (void *)value; + return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, + strlen(p), p); + } + if (!strcmp(type, "cipher")) + { + const EVP_CIPHER *c; + c = EVP_get_cipherbyname(value); + if (!c) + return 0; + return pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_CIPHER, -1, (void *)c); + } + if (!strcmp(type, "hexkey")) + { + unsigned char *key; + int r; + long keylen; + key = string_to_hex(value, &keylen); + if (!key) + return 0; + r = pkey_cmac_ctrl(ctx, EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key); + OPENSSL_free(key); + return r; + } + return -2; + } + +const EVP_PKEY_METHOD cmac_pkey_meth = + { + EVP_PKEY_CMAC, + EVP_PKEY_FLAG_SIGCTX_CUSTOM, + pkey_cmac_init, + pkey_cmac_copy, + pkey_cmac_cleanup, + + 0, 0, + + 0, + pkey_cmac_keygen, + + 0, 0, + + 0, 0, + + 0,0, + + cmac_signctx_init, + cmac_signctx, + + 0,0, + + 0,0, + + 0,0, + + 0,0, + + pkey_cmac_ctrl, + pkey_cmac_ctrl_str + + }; diff --git a/src/lib/libcrypto/cmac/cmac.c b/src/lib/libcrypto/cmac/cmac.c new file mode 100644 index 0000000000..8b72b09681 --- /dev/null +++ b/src/lib/libcrypto/cmac/cmac.c @@ -0,0 +1,308 @@ +/* crypto/cmac/cmac.c */ +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project. + */ +/* ==================================================================== + * Copyright (c) 2010 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include +#include +#include +#include "cryptlib.h" +#include + +#ifdef OPENSSL_FIPS +#include +#endif + +struct CMAC_CTX_st + { + /* Cipher context to use */ + EVP_CIPHER_CTX cctx; + /* Keys k1 and k2 */ + unsigned char k1[EVP_MAX_BLOCK_LENGTH]; + unsigned char k2[EVP_MAX_BLOCK_LENGTH]; + /* Temporary block */ + unsigned char tbl[EVP_MAX_BLOCK_LENGTH]; + /* Last (possibly partial) block */ + unsigned char last_block[EVP_MAX_BLOCK_LENGTH]; + /* Number of bytes in last block: -1 means context not initialised */ + int nlast_block; + }; + + +/* Make temporary keys K1 and K2 */ + +static void make_kn(unsigned char *k1, unsigned char *l, int bl) + { + int i; + /* Shift block to left, including carry */ + for (i = 0; i < bl; i++) + { + k1[i] = l[i] << 1; + if (i < bl - 1 && l[i + 1] & 0x80) + k1[i] |= 1; + } + /* If MSB set fixup with R */ + if (l[0] & 0x80) + k1[bl - 1] ^= bl == 16 ? 0x87 : 0x1b; + } + +CMAC_CTX *CMAC_CTX_new(void) + { + CMAC_CTX *ctx; + ctx = OPENSSL_malloc(sizeof(CMAC_CTX)); + if (!ctx) + return NULL; + EVP_CIPHER_CTX_init(&ctx->cctx); + ctx->nlast_block = -1; + return ctx; + } + +void CMAC_CTX_cleanup(CMAC_CTX *ctx) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !ctx->cctx.engine) + { + FIPS_cmac_ctx_cleanup(ctx); + return; + } +#endif + EVP_CIPHER_CTX_cleanup(&ctx->cctx); + OPENSSL_cleanse(ctx->tbl, EVP_MAX_BLOCK_LENGTH); + OPENSSL_cleanse(ctx->k1, EVP_MAX_BLOCK_LENGTH); + OPENSSL_cleanse(ctx->k2, EVP_MAX_BLOCK_LENGTH); + OPENSSL_cleanse(ctx->last_block, EVP_MAX_BLOCK_LENGTH); + ctx->nlast_block = -1; + } + +EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx) + { + return &ctx->cctx; + } + +void CMAC_CTX_free(CMAC_CTX *ctx) + { + CMAC_CTX_cleanup(ctx); + OPENSSL_free(ctx); + } + +int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in) + { + int bl; + if (in->nlast_block == -1) + return 0; + if (!EVP_CIPHER_CTX_copy(&out->cctx, &in->cctx)) + return 0; + bl = EVP_CIPHER_CTX_block_size(&in->cctx); + memcpy(out->k1, in->k1, bl); + memcpy(out->k2, in->k2, bl); + memcpy(out->tbl, in->tbl, bl); + memcpy(out->last_block, in->last_block, bl); + out->nlast_block = in->nlast_block; + return 1; + } + +int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, + const EVP_CIPHER *cipher, ENGINE *impl) + { + static unsigned char zero_iv[EVP_MAX_BLOCK_LENGTH]; +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + { + /* If we have an ENGINE need to allow non FIPS */ + if ((impl || ctx->cctx.engine) + && !(ctx->cctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW)) + + { + EVPerr(EVP_F_CMAC_INIT, EVP_R_DISABLED_FOR_FIPS); + return 0; + } + /* Other algorithm blocking will be done in FIPS_cmac_init, + * via FIPS_cipherinit(). + */ + if (!impl && !ctx->cctx.engine) + return FIPS_cmac_init(ctx, key, keylen, cipher, NULL); + } +#endif + /* All zeros means restart */ + if (!key && !cipher && !impl && keylen == 0) + { + /* Not initialised */ + if (ctx->nlast_block == -1) + return 0; + if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv)) + return 0; + memset(ctx->tbl, 0, EVP_CIPHER_CTX_block_size(&ctx->cctx)); + ctx->nlast_block = 0; + return 1; + } + /* Initialiase context */ + if (cipher && !EVP_EncryptInit_ex(&ctx->cctx, cipher, impl, NULL, NULL)) + return 0; + /* Non-NULL key means initialisation complete */ + if (key) + { + int bl; + if (!EVP_CIPHER_CTX_cipher(&ctx->cctx)) + return 0; + if (!EVP_CIPHER_CTX_set_key_length(&ctx->cctx, keylen)) + return 0; + if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, key, zero_iv)) + return 0; + bl = EVP_CIPHER_CTX_block_size(&ctx->cctx); + if (!EVP_Cipher(&ctx->cctx, ctx->tbl, zero_iv, bl)) + return 0; + make_kn(ctx->k1, ctx->tbl, bl); + make_kn(ctx->k2, ctx->k1, bl); + OPENSSL_cleanse(ctx->tbl, bl); + /* Reset context again ready for first data block */ + if (!EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, zero_iv)) + return 0; + /* Zero tbl so resume works */ + memset(ctx->tbl, 0, bl); + ctx->nlast_block = 0; + } + return 1; + } + +int CMAC_Update(CMAC_CTX *ctx, const void *in, size_t dlen) + { + const unsigned char *data = in; + size_t bl; +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !ctx->cctx.engine) + return FIPS_cmac_update(ctx, in, dlen); +#endif + if (ctx->nlast_block == -1) + return 0; + if (dlen == 0) + return 1; + bl = EVP_CIPHER_CTX_block_size(&ctx->cctx); + /* Copy into partial block if we need to */ + if (ctx->nlast_block > 0) + { + size_t nleft; + nleft = bl - ctx->nlast_block; + if (dlen < nleft) + nleft = dlen; + memcpy(ctx->last_block + ctx->nlast_block, data, nleft); + dlen -= nleft; + ctx->nlast_block += nleft; + /* If no more to process return */ + if (dlen == 0) + return 1; + data += nleft; + /* Else not final block so encrypt it */ + if (!EVP_Cipher(&ctx->cctx, ctx->tbl, ctx->last_block,bl)) + return 0; + } + /* Encrypt all but one of the complete blocks left */ + while(dlen > bl) + { + if (!EVP_Cipher(&ctx->cctx, ctx->tbl, data, bl)) + return 0; + dlen -= bl; + data += bl; + } + /* Copy any data left to last block buffer */ + memcpy(ctx->last_block, data, dlen); + ctx->nlast_block = dlen; + return 1; + + } + +int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen) + { + int i, bl, lb; +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !ctx->cctx.engine) + return FIPS_cmac_final(ctx, out, poutlen); +#endif + if (ctx->nlast_block == -1) + return 0; + bl = EVP_CIPHER_CTX_block_size(&ctx->cctx); + *poutlen = (size_t)bl; + if (!out) + return 1; + lb = ctx->nlast_block; + /* Is last block complete? */ + if (lb == bl) + { + for (i = 0; i < bl; i++) + out[i] = ctx->last_block[i] ^ ctx->k1[i]; + } + else + { + ctx->last_block[lb] = 0x80; + if (bl - lb > 1) + memset(ctx->last_block + lb + 1, 0, bl - lb - 1); + for (i = 0; i < bl; i++) + out[i] = ctx->last_block[i] ^ ctx->k2[i]; + } + if (!EVP_Cipher(&ctx->cctx, out, out, bl)) + { + OPENSSL_cleanse(out, bl); + return 0; + } + return 1; + } + +int CMAC_resume(CMAC_CTX *ctx) + { + if (ctx->nlast_block == -1) + return 0; + /* The buffer "tbl" containes the last fully encrypted block + * which is the last IV (or all zeroes if no last encrypted block). + * The last block has not been modified since CMAC_final(). + * So reinitliasing using the last decrypted block will allow + * CMAC to continue after calling CMAC_Final(). + */ + return EVP_EncryptInit_ex(&ctx->cctx, NULL, NULL, NULL, ctx->tbl); + } diff --git a/src/lib/libcrypto/cmac/cmac.h b/src/lib/libcrypto/cmac/cmac.h new file mode 100644 index 0000000000..712e92dced --- /dev/null +++ b/src/lib/libcrypto/cmac/cmac.h @@ -0,0 +1,82 @@ +/* crypto/cmac/cmac.h */ +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project. + */ +/* ==================================================================== + * Copyright (c) 2010 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + + +#ifndef HEADER_CMAC_H +#define HEADER_CMAC_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* Opaque */ +typedef struct CMAC_CTX_st CMAC_CTX; + +CMAC_CTX *CMAC_CTX_new(void); +void CMAC_CTX_cleanup(CMAC_CTX *ctx); +void CMAC_CTX_free(CMAC_CTX *ctx); +EVP_CIPHER_CTX *CMAC_CTX_get0_cipher_ctx(CMAC_CTX *ctx); +int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in); + +int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t keylen, + const EVP_CIPHER *cipher, ENGINE *impl); +int CMAC_Update(CMAC_CTX *ctx, const void *data, size_t dlen); +int CMAC_Final(CMAC_CTX *ctx, unsigned char *out, size_t *poutlen); +int CMAC_resume(CMAC_CTX *ctx); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/src/lib/libcrypto/cms/cms.h b/src/lib/libcrypto/cms/cms.h index 09c45d0412..36994fa6a2 100644 --- a/src/lib/libcrypto/cms/cms.h +++ b/src/lib/libcrypto/cms/cms.h @@ -111,6 +111,7 @@ DECLARE_ASN1_PRINT_FUNCTION(CMS_ContentInfo) #define CMS_PARTIAL 0x4000 #define CMS_REUSE_DIGEST 0x8000 #define CMS_USE_KEYID 0x10000 +#define CMS_DEBUG_DECRYPT 0x20000 const ASN1_OBJECT *CMS_get0_type(CMS_ContentInfo *cms); @@ -184,6 +185,8 @@ int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert); int CMS_decrypt_set1_key(CMS_ContentInfo *cms, unsigned char *key, size_t keylen, unsigned char *id, size_t idlen); +int CMS_decrypt_set1_password(CMS_ContentInfo *cms, + unsigned char *pass, ossl_ssize_t passlen); STACK_OF(CMS_RecipientInfo) *CMS_get0_RecipientInfos(CMS_ContentInfo *cms); int CMS_RecipientInfo_type(CMS_RecipientInfo *ri); @@ -219,6 +222,16 @@ int CMS_RecipientInfo_set0_key(CMS_RecipientInfo *ri, int CMS_RecipientInfo_kekri_id_cmp(CMS_RecipientInfo *ri, const unsigned char *id, size_t idlen); +int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri, + unsigned char *pass, + ossl_ssize_t passlen); + +CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms, + int iter, int wrap_nid, int pbe_nid, + unsigned char *pass, + ossl_ssize_t passlen, + const EVP_CIPHER *kekciph); + int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri); int CMS_uncompress(CMS_ContentInfo *cms, BIO *dcont, BIO *out, @@ -330,6 +343,7 @@ void ERR_load_CMS_strings(void); #define CMS_F_CHECK_CONTENT 99 #define CMS_F_CMS_ADD0_CERT 164 #define CMS_F_CMS_ADD0_RECIPIENT_KEY 100 +#define CMS_F_CMS_ADD0_RECIPIENT_PASSWORD 165 #define CMS_F_CMS_ADD1_RECEIPTREQUEST 158 #define CMS_F_CMS_ADD1_RECIPIENT_CERT 101 #define CMS_F_CMS_ADD1_SIGNER 102 @@ -344,6 +358,7 @@ void ERR_load_CMS_strings(void); #define CMS_F_CMS_DATAINIT 111 #define CMS_F_CMS_DECRYPT 112 #define CMS_F_CMS_DECRYPT_SET1_KEY 113 +#define CMS_F_CMS_DECRYPT_SET1_PASSWORD 166 #define CMS_F_CMS_DECRYPT_SET1_PKEY 114 #define CMS_F_CMS_DIGESTALGORITHM_FIND_CTX 115 #define CMS_F_CMS_DIGESTALGORITHM_INIT_BIO 116 @@ -378,7 +393,9 @@ void ERR_load_CMS_strings(void); #define CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT 141 #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS 142 #define CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID 143 +#define CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT 167 #define CMS_F_CMS_RECIPIENTINFO_SET0_KEY 144 +#define CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD 168 #define CMS_F_CMS_RECIPIENTINFO_SET0_PKEY 145 #define CMS_F_CMS_SET1_SIGNERIDENTIFIER 146 #define CMS_F_CMS_SET_DETACHED 147 @@ -419,6 +436,7 @@ void ERR_load_CMS_strings(void); #define CMS_R_ERROR_SETTING_KEY 115 #define CMS_R_ERROR_SETTING_RECIPIENTINFO 116 #define CMS_R_INVALID_ENCRYPTED_KEY_LENGTH 117 +#define CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER 176 #define CMS_R_INVALID_KEY_LENGTH 118 #define CMS_R_MD_BIO_INIT_ERROR 119 #define CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH 120 @@ -431,6 +449,7 @@ void ERR_load_CMS_strings(void); #define CMS_R_NOT_ENCRYPTED_DATA 122 #define CMS_R_NOT_KEK 123 #define CMS_R_NOT_KEY_TRANSPORT 124 +#define CMS_R_NOT_PWRI 177 #define CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE 125 #define CMS_R_NO_CIPHER 126 #define CMS_R_NO_CONTENT 127 @@ -443,6 +462,7 @@ void ERR_load_CMS_strings(void); #define CMS_R_NO_MATCHING_RECIPIENT 132 #define CMS_R_NO_MATCHING_SIGNATURE 166 #define CMS_R_NO_MSGSIGDIGEST 167 +#define CMS_R_NO_PASSWORD 178 #define CMS_R_NO_PRIVATE_KEY 133 #define CMS_R_NO_PUBLIC_KEY 134 #define CMS_R_NO_RECEIPT_REQUEST 168 @@ -466,10 +486,12 @@ void ERR_load_CMS_strings(void); #define CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM 151 #define CMS_R_UNSUPPORTED_CONTENT_TYPE 152 #define CMS_R_UNSUPPORTED_KEK_ALGORITHM 153 +#define CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM 179 #define CMS_R_UNSUPPORTED_RECIPIENT_TYPE 154 #define CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE 155 #define CMS_R_UNSUPPORTED_TYPE 156 #define CMS_R_UNWRAP_ERROR 157 +#define CMS_R_UNWRAP_FAILURE 180 #define CMS_R_VERIFICATION_FAILURE 158 #define CMS_R_WRAP_ERROR 159 diff --git a/src/lib/libcrypto/cms/cms_asn1.c b/src/lib/libcrypto/cms/cms_asn1.c index fcba4dcbcc..cfe67fb6c1 100644 --- a/src/lib/libcrypto/cms/cms_asn1.c +++ b/src/lib/libcrypto/cms/cms_asn1.c @@ -237,6 +237,15 @@ static int cms_ri_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it, OPENSSL_free(kekri->key); } } + else if (ri->type == CMS_RECIPINFO_PASS) + { + CMS_PasswordRecipientInfo *pwri = ri->d.pwri; + if (pwri->pass) + { + OPENSSL_cleanse(pwri->pass, pwri->passlen); + OPENSSL_free(pwri->pass); + } + } } return 1; } diff --git a/src/lib/libcrypto/cms/cms_enc.c b/src/lib/libcrypto/cms/cms_enc.c index bab26235bd..f873ce3794 100644 --- a/src/lib/libcrypto/cms/cms_enc.c +++ b/src/lib/libcrypto/cms/cms_enc.c @@ -73,6 +73,8 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec) const EVP_CIPHER *ciph; X509_ALGOR *calg = ec->contentEncryptionAlgorithm; unsigned char iv[EVP_MAX_IV_LENGTH], *piv = NULL; + unsigned char *tkey = NULL; + size_t tkeylen; int ok = 0; @@ -137,32 +139,57 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec) CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); goto err; } - - - if (enc && !ec->key) + tkeylen = EVP_CIPHER_CTX_key_length(ctx); + /* Generate random session key */ + if (!enc || !ec->key) { - /* Generate random key */ - if (!ec->keylen) - ec->keylen = EVP_CIPHER_CTX_key_length(ctx); - ec->key = OPENSSL_malloc(ec->keylen); - if (!ec->key) + tkey = OPENSSL_malloc(tkeylen); + if (!tkey) { CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, ERR_R_MALLOC_FAILURE); goto err; } - if (EVP_CIPHER_CTX_rand_key(ctx, ec->key) <= 0) + if (EVP_CIPHER_CTX_rand_key(ctx, tkey) <= 0) goto err; - keep_key = 1; } - else if (ec->keylen != (unsigned int)EVP_CIPHER_CTX_key_length(ctx)) + + if (!ec->key) + { + ec->key = tkey; + ec->keylen = tkeylen; + tkey = NULL; + if (enc) + keep_key = 1; + else + ERR_clear_error(); + + } + + if (ec->keylen != tkeylen) { /* If necessary set key length */ if (EVP_CIPHER_CTX_set_key_length(ctx, ec->keylen) <= 0) { - CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, - CMS_R_INVALID_KEY_LENGTH); - goto err; + /* Only reveal failure if debugging so we don't + * leak information which may be useful in MMA. + */ + if (enc || ec->debug) + { + CMSerr(CMS_F_CMS_ENCRYPTEDCONTENT_INIT_BIO, + CMS_R_INVALID_KEY_LENGTH); + goto err; + } + else + { + /* Use random key */ + OPENSSL_cleanse(ec->key, ec->keylen); + OPENSSL_free(ec->key); + ec->key = tkey; + ec->keylen = tkeylen; + tkey = NULL; + ERR_clear_error(); + } } } @@ -198,6 +225,11 @@ BIO *cms_EncryptedContent_init_bio(CMS_EncryptedContentInfo *ec) OPENSSL_free(ec->key); ec->key = NULL; } + if (tkey) + { + OPENSSL_cleanse(tkey, tkeylen); + OPENSSL_free(tkey); + } if (ok) return b; BIO_free(b); diff --git a/src/lib/libcrypto/cms/cms_env.c b/src/lib/libcrypto/cms/cms_env.c index b3237d4b94..be20b1c024 100644 --- a/src/lib/libcrypto/cms/cms_env.c +++ b/src/lib/libcrypto/cms/cms_env.c @@ -65,14 +65,13 @@ /* CMS EnvelopedData Utilities */ DECLARE_ASN1_ITEM(CMS_EnvelopedData) -DECLARE_ASN1_ITEM(CMS_RecipientInfo) DECLARE_ASN1_ITEM(CMS_KeyTransRecipientInfo) DECLARE_ASN1_ITEM(CMS_KEKRecipientInfo) DECLARE_ASN1_ITEM(CMS_OtherKeyAttribute) DECLARE_STACK_OF(CMS_RecipientInfo) -static CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms) +CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms) { if (OBJ_obj2nid(cms->contentType) != NID_pkcs7_enveloped) { @@ -371,6 +370,8 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms, unsigned char *ek = NULL; size_t eklen; int ret = 0; + CMS_EncryptedContentInfo *ec; + ec = cms->d.envelopedData->encryptedContentInfo; if (ktri->pkey == NULL) { @@ -417,8 +418,14 @@ static int cms_RecipientInfo_ktri_decrypt(CMS_ContentInfo *cms, ret = 1; - cms->d.envelopedData->encryptedContentInfo->key = ek; - cms->d.envelopedData->encryptedContentInfo->keylen = eklen; + if (ec->key) + { + OPENSSL_cleanse(ec->key, ec->keylen); + OPENSSL_free(ec->key); + } + + ec->key = ek; + ec->keylen = eklen; err: if (pctx) @@ -786,6 +793,9 @@ int CMS_RecipientInfo_decrypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri) case CMS_RECIPINFO_KEK: return cms_RecipientInfo_kekri_decrypt(cms, ri); + case CMS_RECIPINFO_PASS: + return cms_RecipientInfo_pwri_crypt(cms, ri, 0); + default: CMSerr(CMS_F_CMS_RECIPIENTINFO_DECRYPT, CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE); @@ -829,6 +839,10 @@ BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms) r = cms_RecipientInfo_kekri_encrypt(cms, ri); break; + case CMS_RECIPINFO_PASS: + r = cms_RecipientInfo_pwri_crypt(cms, ri, 1); + break; + default: CMSerr(CMS_F_CMS_ENVELOPEDDATA_INIT_BIO, CMS_R_UNSUPPORTED_RECIPIENT_TYPE); diff --git a/src/lib/libcrypto/cms/cms_err.c b/src/lib/libcrypto/cms/cms_err.c index ff7b0309e5..8330ead7ed 100644 --- a/src/lib/libcrypto/cms/cms_err.c +++ b/src/lib/libcrypto/cms/cms_err.c @@ -1,6 +1,6 @@ /* crypto/cms/cms_err.c */ /* ==================================================================== - * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2009 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -73,6 +73,7 @@ static ERR_STRING_DATA CMS_str_functs[]= {ERR_FUNC(CMS_F_CHECK_CONTENT), "CHECK_CONTENT"}, {ERR_FUNC(CMS_F_CMS_ADD0_CERT), "CMS_add0_cert"}, {ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_KEY), "CMS_add0_recipient_key"}, +{ERR_FUNC(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD), "CMS_add0_recipient_password"}, {ERR_FUNC(CMS_F_CMS_ADD1_RECEIPTREQUEST), "CMS_add1_ReceiptRequest"}, {ERR_FUNC(CMS_F_CMS_ADD1_RECIPIENT_CERT), "CMS_add1_recipient_cert"}, {ERR_FUNC(CMS_F_CMS_ADD1_SIGNER), "CMS_add1_signer"}, @@ -87,6 +88,7 @@ static ERR_STRING_DATA CMS_str_functs[]= {ERR_FUNC(CMS_F_CMS_DATAINIT), "CMS_dataInit"}, {ERR_FUNC(CMS_F_CMS_DECRYPT), "CMS_decrypt"}, {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_KEY), "CMS_decrypt_set1_key"}, +{ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PASSWORD), "CMS_decrypt_set1_password"}, {ERR_FUNC(CMS_F_CMS_DECRYPT_SET1_PKEY), "CMS_decrypt_set1_pkey"}, {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_FIND_CTX), "cms_DigestAlgorithm_find_ctx"}, {ERR_FUNC(CMS_F_CMS_DIGESTALGORITHM_INIT_BIO), "cms_DigestAlgorithm_init_bio"}, @@ -105,7 +107,7 @@ static ERR_STRING_DATA CMS_str_functs[]= {ERR_FUNC(CMS_F_CMS_GET0_CERTIFICATE_CHOICES), "CMS_GET0_CERTIFICATE_CHOICES"}, {ERR_FUNC(CMS_F_CMS_GET0_CONTENT), "CMS_get0_content"}, {ERR_FUNC(CMS_F_CMS_GET0_ECONTENT_TYPE), "CMS_GET0_ECONTENT_TYPE"}, -{ERR_FUNC(CMS_F_CMS_GET0_ENVELOPED), "CMS_GET0_ENVELOPED"}, +{ERR_FUNC(CMS_F_CMS_GET0_ENVELOPED), "cms_get0_enveloped"}, {ERR_FUNC(CMS_F_CMS_GET0_REVOCATION_CHOICES), "CMS_GET0_REVOCATION_CHOICES"}, {ERR_FUNC(CMS_F_CMS_GET0_SIGNED), "CMS_GET0_SIGNED"}, {ERR_FUNC(CMS_F_CMS_MSGSIGDIGEST_ADD1), "cms_msgSigDigest_add1"}, @@ -121,7 +123,9 @@ static ERR_STRING_DATA CMS_str_functs[]= {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_ENCRYPT), "CMS_RECIPIENTINFO_KTRI_ENCRYPT"}, {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_ALGS), "CMS_RecipientInfo_ktri_get0_algs"}, {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_KTRI_GET0_SIGNER_ID), "CMS_RecipientInfo_ktri_get0_signer_id"}, +{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT), "cms_RecipientInfo_pwri_crypt"}, {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_KEY), "CMS_RecipientInfo_set0_key"}, +{ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD), "CMS_RecipientInfo_set0_password"}, {ERR_FUNC(CMS_F_CMS_RECIPIENTINFO_SET0_PKEY), "CMS_RecipientInfo_set0_pkey"}, {ERR_FUNC(CMS_F_CMS_SET1_SIGNERIDENTIFIER), "cms_set1_SignerIdentifier"}, {ERR_FUNC(CMS_F_CMS_SET_DETACHED), "CMS_set_detached"}, @@ -165,6 +169,7 @@ static ERR_STRING_DATA CMS_str_reasons[]= {ERR_REASON(CMS_R_ERROR_SETTING_KEY) ,"error setting key"}, {ERR_REASON(CMS_R_ERROR_SETTING_RECIPIENTINFO),"error setting recipientinfo"}, {ERR_REASON(CMS_R_INVALID_ENCRYPTED_KEY_LENGTH),"invalid encrypted key length"}, +{ERR_REASON(CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER),"invalid key encryption parameter"}, {ERR_REASON(CMS_R_INVALID_KEY_LENGTH) ,"invalid key length"}, {ERR_REASON(CMS_R_MD_BIO_INIT_ERROR) ,"md bio init error"}, {ERR_REASON(CMS_R_MESSAGEDIGEST_ATTRIBUTE_WRONG_LENGTH),"messagedigest attribute wrong length"}, @@ -177,6 +182,7 @@ static ERR_STRING_DATA CMS_str_reasons[]= {ERR_REASON(CMS_R_NOT_ENCRYPTED_DATA) ,"not encrypted data"}, {ERR_REASON(CMS_R_NOT_KEK) ,"not kek"}, {ERR_REASON(CMS_R_NOT_KEY_TRANSPORT) ,"not key transport"}, +{ERR_REASON(CMS_R_NOT_PWRI) ,"not pwri"}, {ERR_REASON(CMS_R_NOT_SUPPORTED_FOR_THIS_KEY_TYPE),"not supported for this key type"}, {ERR_REASON(CMS_R_NO_CIPHER) ,"no cipher"}, {ERR_REASON(CMS_R_NO_CONTENT) ,"no content"}, @@ -189,6 +195,7 @@ static ERR_STRING_DATA CMS_str_reasons[]= {ERR_REASON(CMS_R_NO_MATCHING_RECIPIENT) ,"no matching recipient"}, {ERR_REASON(CMS_R_NO_MATCHING_SIGNATURE) ,"no matching signature"}, {ERR_REASON(CMS_R_NO_MSGSIGDIGEST) ,"no msgsigdigest"}, +{ERR_REASON(CMS_R_NO_PASSWORD) ,"no password"}, {ERR_REASON(CMS_R_NO_PRIVATE_KEY) ,"no private key"}, {ERR_REASON(CMS_R_NO_PUBLIC_KEY) ,"no public key"}, {ERR_REASON(CMS_R_NO_RECEIPT_REQUEST) ,"no receipt request"}, @@ -212,10 +219,12 @@ static ERR_STRING_DATA CMS_str_reasons[]= {ERR_REASON(CMS_R_UNSUPPORTED_COMPRESSION_ALGORITHM),"unsupported compression algorithm"}, {ERR_REASON(CMS_R_UNSUPPORTED_CONTENT_TYPE),"unsupported content type"}, {ERR_REASON(CMS_R_UNSUPPORTED_KEK_ALGORITHM),"unsupported kek algorithm"}, +{ERR_REASON(CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM),"unsupported key encryption algorithm"}, {ERR_REASON(CMS_R_UNSUPPORTED_RECIPIENT_TYPE),"unsupported recipient type"}, {ERR_REASON(CMS_R_UNSUPPORTED_RECPIENTINFO_TYPE),"unsupported recpientinfo type"}, {ERR_REASON(CMS_R_UNSUPPORTED_TYPE) ,"unsupported type"}, {ERR_REASON(CMS_R_UNWRAP_ERROR) ,"unwrap error"}, +{ERR_REASON(CMS_R_UNWRAP_FAILURE) ,"unwrap failure"}, {ERR_REASON(CMS_R_VERIFICATION_FAILURE) ,"verification failure"}, {ERR_REASON(CMS_R_WRAP_ERROR) ,"wrap error"}, {0,NULL} diff --git a/src/lib/libcrypto/cms/cms_lcl.h b/src/lib/libcrypto/cms/cms_lcl.h index c8ecfa724a..a9f9730157 100644 --- a/src/lib/libcrypto/cms/cms_lcl.h +++ b/src/lib/libcrypto/cms/cms_lcl.h @@ -175,6 +175,8 @@ struct CMS_EncryptedContentInfo_st const EVP_CIPHER *cipher; unsigned char *key; size_t keylen; + /* Set to 1 if we are debugging decrypt and don't fake keys for MMA */ + int debug; }; struct CMS_RecipientInfo_st @@ -273,6 +275,9 @@ struct CMS_PasswordRecipientInfo_st X509_ALGOR *keyDerivationAlgorithm; X509_ALGOR *keyEncryptionAlgorithm; ASN1_OCTET_STRING *encryptedKey; + /* Extra info: password to use */ + unsigned char *pass; + size_t passlen; }; struct CMS_OtherRecipientInfo_st @@ -411,6 +416,8 @@ DECLARE_ASN1_ITEM(CMS_SignerInfo) DECLARE_ASN1_ITEM(CMS_IssuerAndSerialNumber) DECLARE_ASN1_ITEM(CMS_Attributes_Sign) DECLARE_ASN1_ITEM(CMS_Attributes_Verify) +DECLARE_ASN1_ITEM(CMS_RecipientInfo) +DECLARE_ASN1_ITEM(CMS_PasswordRecipientInfo) DECLARE_ASN1_ALLOC_FUNCTIONS(CMS_IssuerAndSerialNumber) #define CMS_SIGNERINFO_ISSUER_SERIAL 0 @@ -454,6 +461,11 @@ int cms_msgSigDigest_add1(CMS_SignerInfo *dest, CMS_SignerInfo *src); ASN1_OCTET_STRING *cms_encode_Receipt(CMS_SignerInfo *si); BIO *cms_EnvelopedData_init_bio(CMS_ContentInfo *cms); +CMS_EnvelopedData *cms_get0_enveloped(CMS_ContentInfo *cms); + +/* PWRI routines */ +int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri, + int en_de); #ifdef __cplusplus } diff --git a/src/lib/libcrypto/cms/cms_lib.c b/src/lib/libcrypto/cms/cms_lib.c index d00fe0f87b..f88e8f3b52 100644 --- a/src/lib/libcrypto/cms/cms_lib.c +++ b/src/lib/libcrypto/cms/cms_lib.c @@ -412,8 +412,7 @@ int cms_DigestAlgorithm_find_ctx(EVP_MD_CTX *mctx, BIO *chain, */ || EVP_MD_pkey_type(EVP_MD_CTX_md(mtmp)) == nid) { - EVP_MD_CTX_copy_ex(mctx, mtmp); - return 1; + return EVP_MD_CTX_copy_ex(mctx, mtmp); } chain = BIO_next(chain); } diff --git a/src/lib/libcrypto/cms/cms_pwri.c b/src/lib/libcrypto/cms/cms_pwri.c new file mode 100644 index 0000000000..b79612a12d --- /dev/null +++ b/src/lib/libcrypto/cms/cms_pwri.c @@ -0,0 +1,454 @@ +/* crypto/cms/cms_pwri.c */ +/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL + * project. + */ +/* ==================================================================== + * Copyright (c) 2009 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include "cryptlib.h" +#include +#include +#include +#include +#include +#include +#include +#include "cms_lcl.h" +#include "asn1_locl.h" + +int CMS_RecipientInfo_set0_password(CMS_RecipientInfo *ri, + unsigned char *pass, ossl_ssize_t passlen) + { + CMS_PasswordRecipientInfo *pwri; + if (ri->type != CMS_RECIPINFO_PASS) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_SET0_PASSWORD, CMS_R_NOT_PWRI); + return 0; + } + + pwri = ri->d.pwri; + pwri->pass = pass; + if (pass && passlen < 0) + passlen = strlen((char *)pass); + pwri->passlen = passlen; + return 1; + } + +CMS_RecipientInfo *CMS_add0_recipient_password(CMS_ContentInfo *cms, + int iter, int wrap_nid, int pbe_nid, + unsigned char *pass, + ossl_ssize_t passlen, + const EVP_CIPHER *kekciph) + { + CMS_RecipientInfo *ri = NULL; + CMS_EnvelopedData *env; + CMS_PasswordRecipientInfo *pwri; + EVP_CIPHER_CTX ctx; + X509_ALGOR *encalg = NULL; + unsigned char iv[EVP_MAX_IV_LENGTH]; + int ivlen; + env = cms_get0_enveloped(cms); + if (!env) + goto err; + + if (wrap_nid <= 0) + wrap_nid = NID_id_alg_PWRI_KEK; + + if (pbe_nid <= 0) + pbe_nid = NID_id_pbkdf2; + + /* Get from enveloped data */ + if (kekciph == NULL) + kekciph = env->encryptedContentInfo->cipher; + + if (kekciph == NULL) + { + CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, CMS_R_NO_CIPHER); + return NULL; + } + if (wrap_nid != NID_id_alg_PWRI_KEK) + { + CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, + CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM); + return NULL; + } + + /* Setup algorithm identifier for cipher */ + encalg = X509_ALGOR_new(); + EVP_CIPHER_CTX_init(&ctx); + + if (EVP_EncryptInit_ex(&ctx, kekciph, NULL, NULL, NULL) <= 0) + { + CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_EVP_LIB); + goto err; + } + + ivlen = EVP_CIPHER_CTX_iv_length(&ctx); + + if (ivlen > 0) + { + if (RAND_pseudo_bytes(iv, ivlen) <= 0) + goto err; + if (EVP_EncryptInit_ex(&ctx, NULL, NULL, NULL, iv) <= 0) + { + CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, + ERR_R_EVP_LIB); + goto err; + } + encalg->parameter = ASN1_TYPE_new(); + if (!encalg->parameter) + { + CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, + ERR_R_MALLOC_FAILURE); + goto err; + } + if (EVP_CIPHER_param_to_asn1(&ctx, encalg->parameter) <= 0) + { + CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, + CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); + goto err; + } + } + + + encalg->algorithm = OBJ_nid2obj(EVP_CIPHER_CTX_type(&ctx)); + + EVP_CIPHER_CTX_cleanup(&ctx); + + /* Initialize recipient info */ + ri = M_ASN1_new_of(CMS_RecipientInfo); + if (!ri) + goto merr; + + ri->d.pwri = M_ASN1_new_of(CMS_PasswordRecipientInfo); + if (!ri->d.pwri) + goto merr; + ri->type = CMS_RECIPINFO_PASS; + + pwri = ri->d.pwri; + /* Since this is overwritten, free up empty structure already there */ + X509_ALGOR_free(pwri->keyEncryptionAlgorithm); + pwri->keyEncryptionAlgorithm = X509_ALGOR_new(); + if (!pwri->keyEncryptionAlgorithm) + goto merr; + pwri->keyEncryptionAlgorithm->algorithm = OBJ_nid2obj(wrap_nid); + pwri->keyEncryptionAlgorithm->parameter = ASN1_TYPE_new(); + if (!pwri->keyEncryptionAlgorithm->parameter) + goto merr; + + if(!ASN1_item_pack(encalg, ASN1_ITEM_rptr(X509_ALGOR), + &pwri->keyEncryptionAlgorithm->parameter->value.sequence)) + goto merr; + pwri->keyEncryptionAlgorithm->parameter->type = V_ASN1_SEQUENCE; + + X509_ALGOR_free(encalg); + encalg = NULL; + + /* Setup PBE algorithm */ + + pwri->keyDerivationAlgorithm = PKCS5_pbkdf2_set(iter, NULL, 0, -1, -1); + + if (!pwri->keyDerivationAlgorithm) + goto err; + + CMS_RecipientInfo_set0_password(ri, pass, passlen); + pwri->version = 0; + + if (!sk_CMS_RecipientInfo_push(env->recipientInfos, ri)) + goto merr; + + return ri; + + merr: + CMSerr(CMS_F_CMS_ADD0_RECIPIENT_PASSWORD, ERR_R_MALLOC_FAILURE); + err: + EVP_CIPHER_CTX_cleanup(&ctx); + if (ri) + M_ASN1_free_of(ri, CMS_RecipientInfo); + if (encalg) + X509_ALGOR_free(encalg); + return NULL; + + } + +/* This is an implementation of the key wrapping mechanism in RFC3211, + * at some point this should go into EVP. + */ + +static int kek_unwrap_key(unsigned char *out, size_t *outlen, + const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx) + { + size_t blocklen = EVP_CIPHER_CTX_block_size(ctx); + unsigned char *tmp; + int outl, rv = 0; + if (inlen < 2 * blocklen) + { + /* too small */ + return 0; + } + if (inlen % blocklen) + { + /* Invalid size */ + return 0; + } + tmp = OPENSSL_malloc(inlen); + /* setup IV by decrypting last two blocks */ + EVP_DecryptUpdate(ctx, tmp + inlen - 2 * blocklen, &outl, + in + inlen - 2 * blocklen, blocklen * 2); + /* Do a decrypt of last decrypted block to set IV to correct value + * output it to start of buffer so we don't corrupt decrypted block + * this works because buffer is at least two block lengths long. + */ + EVP_DecryptUpdate(ctx, tmp, &outl, + tmp + inlen - blocklen, blocklen); + /* Can now decrypt first n - 1 blocks */ + EVP_DecryptUpdate(ctx, tmp, &outl, in, inlen - blocklen); + + /* Reset IV to original value */ + EVP_DecryptInit_ex(ctx, NULL, NULL, NULL, NULL); + /* Decrypt again */ + EVP_DecryptUpdate(ctx, tmp, &outl, tmp, inlen); + /* Check check bytes */ + if (((tmp[1] ^ tmp[4]) & (tmp[2] ^ tmp[5]) & (tmp[3] ^ tmp[6])) != 0xff) + { + /* Check byte failure */ + goto err; + } + if (inlen < (size_t)(tmp[0] - 4 )) + { + /* Invalid length value */ + goto err; + } + *outlen = (size_t)tmp[0]; + memcpy(out, tmp + 4, *outlen); + rv = 1; + err: + OPENSSL_cleanse(tmp, inlen); + OPENSSL_free(tmp); + return rv; + + } + +static int kek_wrap_key(unsigned char *out, size_t *outlen, + const unsigned char *in, size_t inlen, EVP_CIPHER_CTX *ctx) + { + size_t blocklen = EVP_CIPHER_CTX_block_size(ctx); + size_t olen; + int dummy; + /* First decide length of output buffer: need header and round up to + * multiple of block length. + */ + olen = (inlen + 4 + blocklen - 1)/blocklen; + olen *= blocklen; + if (olen < 2 * blocklen) + { + /* Key too small */ + return 0; + } + if (inlen > 0xFF) + { + /* Key too large */ + return 0; + } + if (out) + { + /* Set header */ + out[0] = (unsigned char)inlen; + out[1] = in[0] ^ 0xFF; + out[2] = in[1] ^ 0xFF; + out[3] = in[2] ^ 0xFF; + memcpy(out + 4, in, inlen); + /* Add random padding to end */ + if (olen > inlen + 4) + RAND_pseudo_bytes(out + 4 + inlen, olen - 4 - inlen); + /* Encrypt twice */ + EVP_EncryptUpdate(ctx, out, &dummy, out, olen); + EVP_EncryptUpdate(ctx, out, &dummy, out, olen); + } + + *outlen = olen; + + return 1; + } + +/* Encrypt/Decrypt content key in PWRI recipient info */ + +int cms_RecipientInfo_pwri_crypt(CMS_ContentInfo *cms, CMS_RecipientInfo *ri, + int en_de) + { + CMS_EncryptedContentInfo *ec; + CMS_PasswordRecipientInfo *pwri; + const unsigned char *p = NULL; + int plen; + int r = 0; + X509_ALGOR *algtmp, *kekalg = NULL; + EVP_CIPHER_CTX kekctx; + const EVP_CIPHER *kekcipher; + unsigned char *key = NULL; + size_t keylen; + + ec = cms->d.envelopedData->encryptedContentInfo; + + pwri = ri->d.pwri; + EVP_CIPHER_CTX_init(&kekctx); + + if (!pwri->pass) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, CMS_R_NO_PASSWORD); + return 0; + } + algtmp = pwri->keyEncryptionAlgorithm; + + if (!algtmp || OBJ_obj2nid(algtmp->algorithm) != NID_id_alg_PWRI_KEK) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, + CMS_R_UNSUPPORTED_KEY_ENCRYPTION_ALGORITHM); + return 0; + } + + if (algtmp->parameter->type == V_ASN1_SEQUENCE) + { + p = algtmp->parameter->value.sequence->data; + plen = algtmp->parameter->value.sequence->length; + kekalg = d2i_X509_ALGOR(NULL, &p, plen); + } + if (kekalg == NULL) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, + CMS_R_INVALID_KEY_ENCRYPTION_PARAMETER); + return 0; + } + + kekcipher = EVP_get_cipherbyobj(kekalg->algorithm); + + if(!kekcipher) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, + CMS_R_UNKNOWN_CIPHER); + goto err; + } + + /* Fixup cipher based on AlgorithmIdentifier to set IV etc */ + if (!EVP_CipherInit_ex(&kekctx, kekcipher, NULL, NULL, NULL, en_de)) + goto err; + EVP_CIPHER_CTX_set_padding(&kekctx, 0); + if(EVP_CIPHER_asn1_to_param(&kekctx, kekalg->parameter) < 0) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, + CMS_R_CIPHER_PARAMETER_INITIALISATION_ERROR); + goto err; + } + + algtmp = pwri->keyDerivationAlgorithm; + + /* Finish password based key derivation to setup key in "ctx" */ + + if (EVP_PBE_CipherInit(algtmp->algorithm, + (char *)pwri->pass, pwri->passlen, + algtmp->parameter, &kekctx, en_de) < 0) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, ERR_R_EVP_LIB); + goto err; + } + + /* Finally wrap/unwrap the key */ + + if (en_de) + { + + if (!kek_wrap_key(NULL, &keylen, ec->key, ec->keylen, &kekctx)) + goto err; + + key = OPENSSL_malloc(keylen); + + if (!key) + goto err; + + if (!kek_wrap_key(key, &keylen, ec->key, ec->keylen, &kekctx)) + goto err; + pwri->encryptedKey->data = key; + pwri->encryptedKey->length = keylen; + } + else + { + key = OPENSSL_malloc(pwri->encryptedKey->length); + + if (!key) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, + ERR_R_MALLOC_FAILURE); + goto err; + } + if (!kek_unwrap_key(key, &keylen, + pwri->encryptedKey->data, + pwri->encryptedKey->length, &kekctx)) + { + CMSerr(CMS_F_CMS_RECIPIENTINFO_PWRI_CRYPT, + CMS_R_UNWRAP_FAILURE); + goto err; + } + + ec->key = key; + ec->keylen = keylen; + + } + + r = 1; + + err: + + EVP_CIPHER_CTX_cleanup(&kekctx); + + if (!r && key) + OPENSSL_free(key); + X509_ALGOR_free(kekalg); + + return r; + + } diff --git a/src/lib/libcrypto/cms/cms_sd.c b/src/lib/libcrypto/cms/cms_sd.c index e3192b9c57..77fbd13596 100644 --- a/src/lib/libcrypto/cms/cms_sd.c +++ b/src/lib/libcrypto/cms/cms_sd.c @@ -641,7 +641,8 @@ static int cms_SignerInfo_content_sign(CMS_ContentInfo *cms, cms->d.signedData->encapContentInfo->eContentType; unsigned char md[EVP_MAX_MD_SIZE]; unsigned int mdlen; - EVP_DigestFinal_ex(&mctx, md, &mdlen); + if (!EVP_DigestFinal_ex(&mctx, md, &mdlen)) + goto err; if (!CMS_signed_add1_attr_by_NID(si, NID_pkcs9_messageDigest, V_ASN1_OCTET_STRING, md, mdlen)) diff --git a/src/lib/libcrypto/cms/cms_smime.c b/src/lib/libcrypto/cms/cms_smime.c index 4a799eb897..8c56e3a852 100644 --- a/src/lib/libcrypto/cms/cms_smime.c +++ b/src/lib/libcrypto/cms/cms_smime.c @@ -611,7 +611,10 @@ int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert) STACK_OF(CMS_RecipientInfo) *ris; CMS_RecipientInfo *ri; int i, r; + int debug = 0; ris = CMS_get0_RecipientInfos(cms); + if (ris) + debug = cms->d.envelopedData->encryptedContentInfo->debug; for (i = 0; i < sk_CMS_RecipientInfo_num(ris); i++) { ri = sk_CMS_RecipientInfo_value(ris, i); @@ -625,17 +628,38 @@ int CMS_decrypt_set1_pkey(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert) CMS_RecipientInfo_set0_pkey(ri, pk); r = CMS_RecipientInfo_decrypt(cms, ri); CMS_RecipientInfo_set0_pkey(ri, NULL); - if (r > 0) - return 1; if (cert) { + /* If not debugging clear any error and + * return success to avoid leaking of + * information useful to MMA + */ + if (!debug) + { + ERR_clear_error(); + return 1; + } + if (r > 0) + return 1; CMSerr(CMS_F_CMS_DECRYPT_SET1_PKEY, CMS_R_DECRYPT_ERROR); return 0; } - ERR_clear_error(); + /* If no cert and not debugging don't leave loop + * after first successful decrypt. Always attempt + * to decrypt all recipients to avoid leaking timing + * of a successful decrypt. + */ + else if (r > 0 && debug) + return 1; } } + /* If no cert and not debugging always return success */ + if (!cert && !debug) + { + ERR_clear_error(); + return 1; + } CMSerr(CMS_F_CMS_DECRYPT_SET1_PKEY, CMS_R_NO_MATCHING_RECIPIENT); return 0; @@ -680,6 +704,30 @@ int CMS_decrypt_set1_key(CMS_ContentInfo *cms, return 0; } + +int CMS_decrypt_set1_password(CMS_ContentInfo *cms, + unsigned char *pass, ossl_ssize_t passlen) + { + STACK_OF(CMS_RecipientInfo) *ris; + CMS_RecipientInfo *ri; + int i, r; + ris = CMS_get0_RecipientInfos(cms); + for (i = 0; i < sk_CMS_RecipientInfo_num(ris); i++) + { + ri = sk_CMS_RecipientInfo_value(ris, i); + if (CMS_RecipientInfo_type(ri) != CMS_RECIPINFO_PASS) + continue; + CMS_RecipientInfo_set0_password(ri, pass, passlen); + r = CMS_RecipientInfo_decrypt(cms, ri); + CMS_RecipientInfo_set0_password(ri, NULL, 0); + if (r > 0) + return 1; + } + + CMSerr(CMS_F_CMS_DECRYPT_SET1_PASSWORD, CMS_R_NO_MATCHING_RECIPIENT); + return 0; + + } int CMS_decrypt(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert, BIO *dcont, BIO *out, @@ -694,9 +742,14 @@ int CMS_decrypt(CMS_ContentInfo *cms, EVP_PKEY *pk, X509 *cert, } if (!dcont && !check_content(cms)) return 0; + if (flags & CMS_DEBUG_DECRYPT) + cms->d.envelopedData->encryptedContentInfo->debug = 1; + else + cms->d.envelopedData->encryptedContentInfo->debug = 0; + if (!pk && !cert && !dcont && !out) + return 1; if (pk && !CMS_decrypt_set1_pkey(cms, pk, cert)) return 0; - cont = CMS_dataInit(cms, dcont); if (!cont) return 0; diff --git a/src/lib/libcrypto/comp/c_rle.c b/src/lib/libcrypto/comp/c_rle.c index 18bceae51e..47dfb67fbd 100644 --- a/src/lib/libcrypto/comp/c_rle.c +++ b/src/lib/libcrypto/comp/c_rle.c @@ -30,7 +30,7 @@ static int rle_compress_block(COMP_CTX *ctx, unsigned char *out, { /* int i; */ - if (olen < (ilen+1)) + if (ilen == 0 || olen < (ilen-1)) { /* ZZZZZZZZZZZZZZZZZZZZZZ */ return(-1); @@ -46,7 +46,7 @@ static int rle_expand_block(COMP_CTX *ctx, unsigned char *out, { int i; - if (ilen == 0 || olen < (ilen-1)) + if (olen < (ilen-1)) { /* ZZZZZZZZZZZZZZZZZZZZZZ */ return(-1); diff --git a/src/lib/libcrypto/cpt_err.c b/src/lib/libcrypto/cpt_err.c index 139b9284e4..289005f662 100644 --- a/src/lib/libcrypto/cpt_err.c +++ b/src/lib/libcrypto/cpt_err.c @@ -1,6 +1,6 @@ /* crypto/cpt_err.c */ /* ==================================================================== - * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -76,6 +76,7 @@ static ERR_STRING_DATA CRYPTO_str_functs[]= {ERR_FUNC(CRYPTO_F_CRYPTO_SET_EX_DATA), "CRYPTO_set_ex_data"}, {ERR_FUNC(CRYPTO_F_DEF_ADD_INDEX), "DEF_ADD_INDEX"}, {ERR_FUNC(CRYPTO_F_DEF_GET_CLASS), "DEF_GET_CLASS"}, +{ERR_FUNC(CRYPTO_F_FIPS_MODE_SET), "FIPS_mode_set"}, {ERR_FUNC(CRYPTO_F_INT_DUP_EX_DATA), "INT_DUP_EX_DATA"}, {ERR_FUNC(CRYPTO_F_INT_FREE_EX_DATA), "INT_FREE_EX_DATA"}, {ERR_FUNC(CRYPTO_F_INT_NEW_EX_DATA), "INT_NEW_EX_DATA"}, @@ -84,6 +85,7 @@ static ERR_STRING_DATA CRYPTO_str_functs[]= static ERR_STRING_DATA CRYPTO_str_reasons[]= { +{ERR_REASON(CRYPTO_R_FIPS_MODE_NOT_SUPPORTED),"fips mode not supported"}, {ERR_REASON(CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK),"no dynlock create callback"}, {0,NULL} }; diff --git a/src/lib/libcrypto/cryptlib.c b/src/lib/libcrypto/cryptlib.c index 24fe123e14..766ea8cac7 100644 --- a/src/lib/libcrypto/cryptlib.c +++ b/src/lib/libcrypto/cryptlib.c @@ -409,6 +409,10 @@ int (*CRYPTO_get_add_lock_callback(void))(int *num,int mount,int type, void CRYPTO_set_locking_callback(void (*func)(int mode,int type, const char *file,int line)) { + /* Calling this here ensures initialisation before any threads + * are started. + */ + OPENSSL_init(); locking_callback=func; } @@ -661,28 +665,52 @@ const char *CRYPTO_get_lock_name(int type) defined(__INTEL__) || \ defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64) -unsigned long OPENSSL_ia32cap_P=0; -unsigned long *OPENSSL_ia32cap_loc(void) { return &OPENSSL_ia32cap_P; } +unsigned int OPENSSL_ia32cap_P[2]; +unsigned long *OPENSSL_ia32cap_loc(void) +{ if (sizeof(long)==4) + /* + * If 32-bit application pulls address of OPENSSL_ia32cap_P[0] + * clear second element to maintain the illusion that vector + * is 32-bit. + */ + OPENSSL_ia32cap_P[1]=0; + return (unsigned long *)OPENSSL_ia32cap_P; +} #if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM) && !defined(I386_ONLY) #define OPENSSL_CPUID_SETUP +#if defined(_WIN32) +typedef unsigned __int64 IA32CAP; +#else +typedef unsigned long long IA32CAP; +#endif void OPENSSL_cpuid_setup(void) { static int trigger=0; - unsigned long OPENSSL_ia32_cpuid(void); + IA32CAP OPENSSL_ia32_cpuid(void); + IA32CAP vec; char *env; if (trigger) return; trigger=1; - if ((env=getenv("OPENSSL_ia32cap"))) - OPENSSL_ia32cap_P = strtoul(env,NULL,0)|(1<<10); + if ((env=getenv("OPENSSL_ia32cap"))) { + int off = (env[0]=='~')?1:0; +#if defined(_WIN32) + if (!sscanf(env+off,"%I64i",&vec)) vec = strtoul(env+off,NULL,0); +#else + if (!sscanf(env+off,"%lli",(long long *)&vec)) vec = strtoul(env+off,NULL,0); +#endif + if (off) vec = OPENSSL_ia32_cpuid()&~vec; + } else - OPENSSL_ia32cap_P = OPENSSL_ia32_cpuid()|(1<<10); + vec = OPENSSL_ia32_cpuid(); /* * |(1<<10) sets a reserved bit to signal that variable * was initialized already... This is to avoid interference * with cpuid snippets in ELF .init segment. */ + OPENSSL_ia32cap_P[0] = (unsigned int)vec|(1<<10); + OPENSSL_ia32cap_P[1] = (unsigned int)(vec>>32); } #endif diff --git a/src/lib/libcrypto/cryptlib.h b/src/lib/libcrypto/cryptlib.h index fc249c57f3..1761f6b668 100644 --- a/src/lib/libcrypto/cryptlib.h +++ b/src/lib/libcrypto/cryptlib.h @@ -99,7 +99,7 @@ extern "C" { #define HEX_SIZE(type) (sizeof(type)*2) void OPENSSL_cpuid_setup(void); -extern unsigned long OPENSSL_ia32cap_P; +extern unsigned int OPENSSL_ia32cap_P[]; void OPENSSL_showfatal(const char *,...); void *OPENSSL_stderr(void); extern int OPENSSL_NONPIC_relocated; diff --git a/src/lib/libcrypto/crypto.h b/src/lib/libcrypto/crypto.h index b0360cec51..6aeda0a9ac 100644 --- a/src/lib/libcrypto/crypto.h +++ b/src/lib/libcrypto/crypto.h @@ -547,6 +547,33 @@ unsigned long *OPENSSL_ia32cap_loc(void); #define OPENSSL_ia32cap (*(OPENSSL_ia32cap_loc())) int OPENSSL_isservice(void); +int FIPS_mode(void); +int FIPS_mode_set(int r); + +void OPENSSL_init(void); + +#define fips_md_init(alg) fips_md_init_ctx(alg, alg) + +#ifdef OPENSSL_FIPS +#define fips_md_init_ctx(alg, cx) \ + int alg##_Init(cx##_CTX *c) \ + { \ + if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \ + "Low level API call to digest " #alg " forbidden in FIPS mode!"); \ + return private_##alg##_Init(c); \ + } \ + int private_##alg##_Init(cx##_CTX *c) + +#define fips_cipher_abort(alg) \ + if (FIPS_mode()) OpenSSLDie(__FILE__, __LINE__, \ + "Low level API call to cipher " #alg " forbidden in FIPS mode!") + +#else +#define fips_md_init_ctx(alg, cx) \ + int alg##_Init(cx##_CTX *c) +#define fips_cipher_abort(alg) while(0) +#endif + /* BEGIN ERROR CODES */ /* The following lines are auto generated by the script mkerr.pl. Any changes * made after this point may be overwritten when the script is next run. @@ -562,11 +589,13 @@ void ERR_load_CRYPTO_strings(void); #define CRYPTO_F_CRYPTO_SET_EX_DATA 102 #define CRYPTO_F_DEF_ADD_INDEX 104 #define CRYPTO_F_DEF_GET_CLASS 105 +#define CRYPTO_F_FIPS_MODE_SET 109 #define CRYPTO_F_INT_DUP_EX_DATA 106 #define CRYPTO_F_INT_FREE_EX_DATA 107 #define CRYPTO_F_INT_NEW_EX_DATA 108 /* Reason codes. */ +#define CRYPTO_R_FIPS_MODE_NOT_SUPPORTED 101 #define CRYPTO_R_NO_DYNLOCK_CREATE_CALLBACK 100 #ifdef __cplusplus diff --git a/src/lib/libcrypto/des/des.h b/src/lib/libcrypto/des/des.h index 92b6663599..1eaedcbd24 100644 --- a/src/lib/libcrypto/des/des.h +++ b/src/lib/libcrypto/des/des.h @@ -224,6 +224,9 @@ int DES_set_key(const_DES_cblock *key,DES_key_schedule *schedule); int DES_key_sched(const_DES_cblock *key,DES_key_schedule *schedule); int DES_set_key_checked(const_DES_cblock *key,DES_key_schedule *schedule); void DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule); +#ifdef OPENSSL_FIPS +void private_DES_set_key_unchecked(const_DES_cblock *key,DES_key_schedule *schedule); +#endif void DES_string_to_key(const char *str,DES_cblock *key); void DES_string_to_2keys(const char *str,DES_cblock *key1,DES_cblock *key2); void DES_cfb64_encrypt(const unsigned char *in,unsigned char *out,long length, diff --git a/src/lib/libcrypto/des/set_key.c b/src/lib/libcrypto/des/set_key.c index 3004cc3ab3..d3e69ca8b5 100644 --- a/src/lib/libcrypto/des/set_key.c +++ b/src/lib/libcrypto/des/set_key.c @@ -65,6 +65,8 @@ */ #include "des_locl.h" +#include + OPENSSL_IMPLEMENT_GLOBAL(int,DES_check_key,0) /* defaults to false */ static const unsigned char odd_parity[256]={ @@ -335,6 +337,13 @@ int DES_set_key_checked(const_DES_cblock *key, DES_key_schedule *schedule) } void DES_set_key_unchecked(const_DES_cblock *key, DES_key_schedule *schedule) +#ifdef OPENSSL_FIPS + { + fips_cipher_abort(DES); + private_DES_set_key_unchecked(key, schedule); + } +void private_DES_set_key_unchecked(const_DES_cblock *key, DES_key_schedule *schedule) +#endif { static const int shifts2[16]={0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0}; register DES_LONG c,d,t,s,t2; diff --git a/src/lib/libcrypto/dh/dh.h b/src/lib/libcrypto/dh/dh.h index 849309a489..ea59e610ef 100644 --- a/src/lib/libcrypto/dh/dh.h +++ b/src/lib/libcrypto/dh/dh.h @@ -86,6 +86,21 @@ * be used for all exponents. */ +/* If this flag is set the DH method is FIPS compliant and can be used + * in FIPS mode. This is set in the validated module method. If an + * application sets this flag in its own methods it is its reposibility + * to ensure the result is compliant. + */ + +#define DH_FLAG_FIPS_METHOD 0x0400 + +/* If this flag is set the operations normally disabled in FIPS mode are + * permitted it is then the applications responsibility to ensure that the + * usage is compliant. + */ + +#define DH_FLAG_NON_FIPS_ALLOW 0x0400 + #ifdef __cplusplus extern "C" { #endif @@ -230,6 +245,9 @@ void ERR_load_DH_strings(void); #define DH_F_COMPUTE_KEY 102 #define DH_F_DHPARAMS_PRINT_FP 101 #define DH_F_DH_BUILTIN_GENPARAMS 106 +#define DH_F_DH_COMPUTE_KEY 114 +#define DH_F_DH_GENERATE_KEY 115 +#define DH_F_DH_GENERATE_PARAMETERS_EX 116 #define DH_F_DH_NEW_METHOD 105 #define DH_F_DH_PARAM_DECODE 107 #define DH_F_DH_PRIV_DECODE 110 @@ -249,7 +267,9 @@ void ERR_load_DH_strings(void); #define DH_R_DECODE_ERROR 104 #define DH_R_INVALID_PUBKEY 102 #define DH_R_KEYS_NOT_SET 108 +#define DH_R_KEY_SIZE_TOO_SMALL 110 #define DH_R_MODULUS_TOO_LARGE 103 +#define DH_R_NON_FIPS_METHOD 111 #define DH_R_NO_PARAMETERS_SET 107 #define DH_R_NO_PRIVATE_VALUE 100 #define DH_R_PARAMETER_ENCODING_ERROR 105 diff --git a/src/lib/libcrypto/dh/dh_ameth.c b/src/lib/libcrypto/dh/dh_ameth.c index 377caf96c9..02ec2d47b4 100644 --- a/src/lib/libcrypto/dh/dh_ameth.c +++ b/src/lib/libcrypto/dh/dh_ameth.c @@ -493,6 +493,7 @@ const EVP_PKEY_ASN1_METHOD dh_asn1_meth = dh_copy_parameters, dh_cmp_parameters, dh_param_print, + 0, int_dh_free, 0 diff --git a/src/lib/libcrypto/dh/dh_err.c b/src/lib/libcrypto/dh/dh_err.c index d5cf0c22a3..56d3df7356 100644 --- a/src/lib/libcrypto/dh/dh_err.c +++ b/src/lib/libcrypto/dh/dh_err.c @@ -1,6 +1,6 @@ /* crypto/dh/dh_err.c */ /* ==================================================================== - * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -73,6 +73,9 @@ static ERR_STRING_DATA DH_str_functs[]= {ERR_FUNC(DH_F_COMPUTE_KEY), "COMPUTE_KEY"}, {ERR_FUNC(DH_F_DHPARAMS_PRINT_FP), "DHparams_print_fp"}, {ERR_FUNC(DH_F_DH_BUILTIN_GENPARAMS), "DH_BUILTIN_GENPARAMS"}, +{ERR_FUNC(DH_F_DH_COMPUTE_KEY), "DH_compute_key"}, +{ERR_FUNC(DH_F_DH_GENERATE_KEY), "DH_generate_key"}, +{ERR_FUNC(DH_F_DH_GENERATE_PARAMETERS_EX), "DH_generate_parameters_ex"}, {ERR_FUNC(DH_F_DH_NEW_METHOD), "DH_new_method"}, {ERR_FUNC(DH_F_DH_PARAM_DECODE), "DH_PARAM_DECODE"}, {ERR_FUNC(DH_F_DH_PRIV_DECODE), "DH_PRIV_DECODE"}, @@ -95,7 +98,9 @@ static ERR_STRING_DATA DH_str_reasons[]= {ERR_REASON(DH_R_DECODE_ERROR) ,"decode error"}, {ERR_REASON(DH_R_INVALID_PUBKEY) ,"invalid public key"}, {ERR_REASON(DH_R_KEYS_NOT_SET) ,"keys not set"}, +{ERR_REASON(DH_R_KEY_SIZE_TOO_SMALL) ,"key size too small"}, {ERR_REASON(DH_R_MODULUS_TOO_LARGE) ,"modulus too large"}, +{ERR_REASON(DH_R_NON_FIPS_METHOD) ,"non fips method"}, {ERR_REASON(DH_R_NO_PARAMETERS_SET) ,"no parameters set"}, {ERR_REASON(DH_R_NO_PRIVATE_VALUE) ,"no private value"}, {ERR_REASON(DH_R_PARAMETER_ENCODING_ERROR),"parameter encoding error"}, diff --git a/src/lib/libcrypto/dh/dh_gen.c b/src/lib/libcrypto/dh/dh_gen.c index cfd5b11868..7b1fe9c9cb 100644 --- a/src/lib/libcrypto/dh/dh_gen.c +++ b/src/lib/libcrypto/dh/dh_gen.c @@ -66,12 +66,29 @@ #include #include +#ifdef OPENSSL_FIPS +#include +#endif + static int dh_builtin_genparams(DH *ret, int prime_len, int generator, BN_GENCB *cb); int DH_generate_parameters_ex(DH *ret, int prime_len, int generator, BN_GENCB *cb) { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(ret->meth->flags & DH_FLAG_FIPS_METHOD) + && !(ret->flags & DH_FLAG_NON_FIPS_ALLOW)) + { + DHerr(DH_F_DH_GENERATE_PARAMETERS_EX, DH_R_NON_FIPS_METHOD); + return 0; + } +#endif if(ret->meth->generate_params) return ret->meth->generate_params(ret, prime_len, generator, cb); +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + return FIPS_dh_generate_parameters_ex(ret, prime_len, + generator, cb); +#endif return dh_builtin_genparams(ret, prime_len, generator, cb); } diff --git a/src/lib/libcrypto/dh/dh_key.c b/src/lib/libcrypto/dh/dh_key.c index e7db440342..89a74db4e6 100644 --- a/src/lib/libcrypto/dh/dh_key.c +++ b/src/lib/libcrypto/dh/dh_key.c @@ -73,11 +73,27 @@ static int dh_finish(DH *dh); int DH_generate_key(DH *dh) { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(dh->meth->flags & DH_FLAG_FIPS_METHOD) + && !(dh->flags & DH_FLAG_NON_FIPS_ALLOW)) + { + DHerr(DH_F_DH_GENERATE_KEY, DH_R_NON_FIPS_METHOD); + return 0; + } +#endif return dh->meth->generate_key(dh); } int DH_compute_key(unsigned char *key, const BIGNUM *pub_key, DH *dh) { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(dh->meth->flags & DH_FLAG_FIPS_METHOD) + && !(dh->flags & DH_FLAG_NON_FIPS_ALLOW)) + { + DHerr(DH_F_DH_COMPUTE_KEY, DH_R_NON_FIPS_METHOD); + return 0; + } +#endif return dh->meth->compute_key(key, pub_key, dh); } @@ -138,8 +154,21 @@ static int generate_key(DH *dh) if (generate_new_key) { - l = dh->length ? dh->length : BN_num_bits(dh->p)-1; /* secret exponent length */ - if (!BN_rand(priv_key, l, 0, 0)) goto err; + if (dh->q) + { + do + { + if (!BN_rand_range(priv_key, dh->q)) + goto err; + } + while (BN_is_zero(priv_key) || BN_is_one(priv_key)); + } + else + { + /* secret exponent length */ + l = dh->length ? dh->length : BN_num_bits(dh->p)-1; + if (!BN_rand(priv_key, l, 0, 0)) goto err; + } } { diff --git a/src/lib/libcrypto/dh/dh_lib.c b/src/lib/libcrypto/dh/dh_lib.c index 7aef080e7a..00218f2b92 100644 --- a/src/lib/libcrypto/dh/dh_lib.c +++ b/src/lib/libcrypto/dh/dh_lib.c @@ -64,6 +64,10 @@ #include #endif +#ifdef OPENSSL_FIPS +#include +#endif + const char DH_version[]="Diffie-Hellman" OPENSSL_VERSION_PTEXT; static const DH_METHOD *default_DH_method = NULL; @@ -76,7 +80,16 @@ void DH_set_default_method(const DH_METHOD *meth) const DH_METHOD *DH_get_default_method(void) { if(!default_DH_method) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + return FIPS_dh_openssl(); + else + return DH_OpenSSL(); +#else default_DH_method = DH_OpenSSL(); +#endif + } return default_DH_method; } @@ -156,7 +169,7 @@ DH *DH_new_method(ENGINE *engine) ret->counter = NULL; ret->method_mont_p=NULL; ret->references = 1; - ret->flags=ret->meth->flags; + ret->flags=ret->meth->flags & ~DH_FLAG_NON_FIPS_ALLOW; CRYPTO_new_ex_data(CRYPTO_EX_INDEX_DH, ret, &ret->ex_data); if ((ret->meth->init != NULL) && !ret->meth->init(ret)) { diff --git a/src/lib/libcrypto/doc/EVP_DigestInit.pod b/src/lib/libcrypto/doc/EVP_DigestInit.pod index 5b477ac6ec..367691cc7a 100644 --- a/src/lib/libcrypto/doc/EVP_DigestInit.pod +++ b/src/lib/libcrypto/doc/EVP_DigestInit.pod @@ -6,7 +6,8 @@ EVP_MD_CTX_init, EVP_MD_CTX_create, EVP_DigestInit_ex, EVP_DigestUpdate, EVP_DigestFinal_ex, EVP_MD_CTX_cleanup, EVP_MD_CTX_destroy, EVP_MAX_MD_SIZE, EVP_MD_CTX_copy_ex, EVP_MD_CTX_copy, EVP_MD_type, EVP_MD_pkey_type, EVP_MD_size, EVP_MD_block_size, EVP_MD_CTX_md, EVP_MD_CTX_size, EVP_MD_CTX_block_size, EVP_MD_CTX_type, -EVP_md_null, EVP_md2, EVP_md5, EVP_sha, EVP_sha1, EVP_dss, EVP_dss1, EVP_mdc2, +EVP_md_null, EVP_md2, EVP_md5, EVP_sha, EVP_sha1, EVP_sha224, EVP_sha256, +EVP_sha384, EVP_sha512, EVP_dss, EVP_dss1, EVP_mdc2, EVP_ripemd160, EVP_get_digestbyname, EVP_get_digestbynid, EVP_get_digestbyobj - EVP digest routines @@ -33,16 +34,15 @@ EVP digest routines int EVP_MD_CTX_copy(EVP_MD_CTX *out,EVP_MD_CTX *in); - #define EVP_MAX_MD_SIZE (16+20) /* The SSLv3 md5+sha1 type */ + #define EVP_MAX_MD_SIZE 64 /* SHA512 */ + int EVP_MD_type(const EVP_MD *md); + int EVP_MD_pkey_type(const EVP_MD *md); + int EVP_MD_size(const EVP_MD *md); + int EVP_MD_block_size(const EVP_MD *md); - #define EVP_MD_type(e) ((e)->type) - #define EVP_MD_pkey_type(e) ((e)->pkey_type) - #define EVP_MD_size(e) ((e)->md_size) - #define EVP_MD_block_size(e) ((e)->block_size) - - #define EVP_MD_CTX_md(e) (e)->digest) - #define EVP_MD_CTX_size(e) EVP_MD_size((e)->digest) + const EVP_MD *EVP_MD_CTX_md(const EVP_MD_CTX *ctx); + #define EVP_MD_CTX_size(e) EVP_MD_size(EVP_MD_CTX_md(e)) #define EVP_MD_CTX_block_size(e) EVP_MD_block_size((e)->digest) #define EVP_MD_CTX_type(e) EVP_MD_type((e)->digest) @@ -56,6 +56,11 @@ EVP digest routines const EVP_MD *EVP_mdc2(void); const EVP_MD *EVP_ripemd160(void); + const EVP_MD *EVP_sha224(void); + const EVP_MD *EVP_sha256(void); + const EVP_MD *EVP_sha384(void); + const EVP_MD *EVP_sha512(void); + const EVP_MD *EVP_get_digestbyname(const char *name); #define EVP_get_digestbynid(a) EVP_get_digestbyname(OBJ_nid2sn(a)) #define EVP_get_digestbyobj(a) EVP_get_digestbynid(OBJ_obj2nid(a)) @@ -124,12 +129,14 @@ B. EVP_MD_pkey_type() returns the NID of the public key signing algorithm associated with this digest. For example EVP_sha1() is associated with RSA so this will -return B. This "link" between digests and signature -algorithms may not be retained in future versions of OpenSSL. +return B. Since digests and signature algorithms +are no longer linked this function is only retained for compatibility +reasons. -EVP_md2(), EVP_md5(), EVP_sha(), EVP_sha1(), EVP_mdc2() and EVP_ripemd160() -return B structures for the MD2, MD5, SHA, SHA1, MDC2 and RIPEMD160 digest -algorithms respectively. The associated signature algorithm is RSA in each case. +EVP_md2(), EVP_md5(), EVP_sha(), EVP_sha1(), EVP_sha224(), EVP_sha256(), +EVP_sha384(), EVP_sha512(), EVP_mdc2() and EVP_ripemd160() return B +structures for the MD2, MD5, SHA, SHA1, SHA224, SHA256, SHA384, SHA512, MDC2 +and RIPEMD160 digest algorithms respectively. EVP_dss() and EVP_dss1() return B structures for SHA and SHA1 digest algorithms but using DSS (DSA) for the signature algorithm. Note: there is @@ -171,8 +178,8 @@ The B interface to message digests should almost always be used in preference to the low level interfaces. This is because the code then becomes transparent to the digest used and much more flexible. -SHA1 is the digest of choice for new applications. The other digest algorithms -are still in common use. +New applications should use the SHA2 digest algorithms such as SHA256. +The other digest algorithms are still in common use. For most applications the B parameter to EVP_DigestInit_ex() will be set to NULL to use the default digest implementation. @@ -187,6 +194,19 @@ implementations of digests to be specified. In OpenSSL 0.9.7 and later if digest contexts are not cleaned up after use memory leaks will occur. +Stack allocation of EVP_MD_CTX structures is common, for example: + + EVP_MD_CTX mctx; + EVP_MD_CTX_init(&mctx); + +This will cause binary compatibility issues if the size of EVP_MD_CTX +structure changes (this will only happen with a major release of OpenSSL). +Applications wishing to avoid this should use EVP_MD_CTX_create() instead: + + EVP_MD_CTX *mctx; + mctx = EVP_MD_CTX_create(); + + =head1 EXAMPLE This example digests the data "Test Message\n" and "Hello World\n", using the @@ -197,7 +217,7 @@ digest name passed on the command line. main(int argc, char *argv[]) { - EVP_MD_CTX mdctx; + EVP_MD_CTX *mdctx; const EVP_MD *md; char mess1[] = "Test Message\n"; char mess2[] = "Hello World\n"; @@ -218,12 +238,12 @@ digest name passed on the command line. exit(1); } - EVP_MD_CTX_init(&mdctx); - EVP_DigestInit_ex(&mdctx, md, NULL); - EVP_DigestUpdate(&mdctx, mess1, strlen(mess1)); - EVP_DigestUpdate(&mdctx, mess2, strlen(mess2)); - EVP_DigestFinal_ex(&mdctx, md_value, &md_len); - EVP_MD_CTX_cleanup(&mdctx); + mdctx = EVP_MD_CTX_create(); + EVP_DigestInit_ex(mdctx, md, NULL); + EVP_DigestUpdate(mdctx, mess1, strlen(mess1)); + EVP_DigestUpdate(mdctx, mess2, strlen(mess2)); + EVP_DigestFinal_ex(mdctx, md_value, &md_len); + EVP_MD_CTX_destroy(mdctx); printf("Digest is: "); for(i = 0; i < md_len; i++) printf("%02x", md_value[i]); diff --git a/src/lib/libcrypto/dsa/dsa.h b/src/lib/libcrypto/dsa/dsa.h index ac50a5c846..a6f6d0b0b2 100644 --- a/src/lib/libcrypto/dsa/dsa.h +++ b/src/lib/libcrypto/dsa/dsa.h @@ -97,6 +97,21 @@ * be used for all exponents. */ +/* If this flag is set the DSA method is FIPS compliant and can be used + * in FIPS mode. This is set in the validated module method. If an + * application sets this flag in its own methods it is its reposibility + * to ensure the result is compliant. + */ + +#define DSA_FLAG_FIPS_METHOD 0x0400 + +/* If this flag is set the operations normally disabled in FIPS mode are + * permitted it is then the applications responsibility to ensure that the + * usage is compliant. + */ + +#define DSA_FLAG_NON_FIPS_ALLOW 0x0400 + #ifdef __cplusplus extern "C" { #endif @@ -272,6 +287,8 @@ void ERR_load_DSA_strings(void); #define DSA_F_DSAPARAMS_PRINT_FP 101 #define DSA_F_DSA_DO_SIGN 112 #define DSA_F_DSA_DO_VERIFY 113 +#define DSA_F_DSA_GENERATE_KEY 124 +#define DSA_F_DSA_GENERATE_PARAMETERS_EX 123 #define DSA_F_DSA_NEW_METHOD 103 #define DSA_F_DSA_PARAM_DECODE 119 #define DSA_F_DSA_PRINT_FP 105 @@ -282,6 +299,7 @@ void ERR_load_DSA_strings(void); #define DSA_F_DSA_SIGN 106 #define DSA_F_DSA_SIGN_SETUP 107 #define DSA_F_DSA_SIG_NEW 109 +#define DSA_F_DSA_SIG_PRINT 125 #define DSA_F_DSA_VERIFY 108 #define DSA_F_I2D_DSA_SIG 111 #define DSA_F_OLD_DSA_PRIV_DECODE 122 @@ -298,6 +316,8 @@ void ERR_load_DSA_strings(void); #define DSA_R_INVALID_DIGEST_TYPE 106 #define DSA_R_MISSING_PARAMETERS 101 #define DSA_R_MODULUS_TOO_LARGE 103 +#define DSA_R_NEED_NEW_SETUP_VALUES 110 +#define DSA_R_NON_FIPS_DSA_METHOD 111 #define DSA_R_NO_PARAMETERS_SET 107 #define DSA_R_PARAMETER_ENCODING_ERROR 105 diff --git a/src/lib/libcrypto/dsa/dsa_ameth.c b/src/lib/libcrypto/dsa/dsa_ameth.c index 6413aae46e..376156ec5e 100644 --- a/src/lib/libcrypto/dsa/dsa_ameth.c +++ b/src/lib/libcrypto/dsa/dsa_ameth.c @@ -542,6 +542,52 @@ static int old_dsa_priv_encode(const EVP_PKEY *pkey, unsigned char **pder) return i2d_DSAPrivateKey(pkey->pkey.dsa, pder); } +static int dsa_sig_print(BIO *bp, const X509_ALGOR *sigalg, + const ASN1_STRING *sig, + int indent, ASN1_PCTX *pctx) + { + DSA_SIG *dsa_sig; + const unsigned char *p; + if (!sig) + { + if (BIO_puts(bp, "\n") <= 0) + return 0; + else + return 1; + } + p = sig->data; + dsa_sig = d2i_DSA_SIG(NULL, &p, sig->length); + if (dsa_sig) + { + int rv = 0; + size_t buf_len = 0; + unsigned char *m=NULL; + update_buflen(dsa_sig->r, &buf_len); + update_buflen(dsa_sig->s, &buf_len); + m = OPENSSL_malloc(buf_len+10); + if (m == NULL) + { + DSAerr(DSA_F_DSA_SIG_PRINT,ERR_R_MALLOC_FAILURE); + goto err; + } + + if (BIO_write(bp, "\n", 1) != 1) + goto err; + + if (!ASN1_bn_print(bp,"r: ",dsa_sig->r,m,indent)) + goto err; + if (!ASN1_bn_print(bp,"s: ",dsa_sig->s,m,indent)) + goto err; + rv = 1; + err: + if (m) + OPENSSL_free(m); + DSA_SIG_free(dsa_sig); + return rv; + } + return X509_signature_dump(bp, sig, indent); + } + static int dsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) { switch (op) @@ -647,6 +693,7 @@ const EVP_PKEY_ASN1_METHOD dsa_asn1_meths[] = dsa_copy_parameters, dsa_cmp_parameters, dsa_param_print, + dsa_sig_print, int_dsa_free, dsa_pkey_ctrl, diff --git a/src/lib/libcrypto/dsa/dsa_asn1.c b/src/lib/libcrypto/dsa/dsa_asn1.c index c37460b2d6..6058534374 100644 --- a/src/lib/libcrypto/dsa/dsa_asn1.c +++ b/src/lib/libcrypto/dsa/dsa_asn1.c @@ -61,6 +61,7 @@ #include #include #include +#include /* Override the default new methods */ static int sig_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it, @@ -87,7 +88,7 @@ ASN1_SEQUENCE_cb(DSA_SIG, sig_cb) = { ASN1_SIMPLE(DSA_SIG, s, CBIGNUM) } ASN1_SEQUENCE_END_cb(DSA_SIG, DSA_SIG) -IMPLEMENT_ASN1_FUNCTIONS_const(DSA_SIG) +IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(DSA_SIG, DSA_SIG, DSA_SIG) /* Override the default free and new methods */ static int dsa_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it, @@ -148,3 +149,40 @@ DSA *DSAparams_dup(DSA *dsa) { return ASN1_item_dup(ASN1_ITEM_rptr(DSAparams), dsa); } + +int DSA_sign(int type, const unsigned char *dgst, int dlen, unsigned char *sig, + unsigned int *siglen, DSA *dsa) + { + DSA_SIG *s; + RAND_seed(dgst, dlen); + s=DSA_do_sign(dgst,dlen,dsa); + if (s == NULL) + { + *siglen=0; + return(0); + } + *siglen=i2d_DSA_SIG(s,&sig); + DSA_SIG_free(s); + return(1); + } + +/* data has already been hashed (probably with SHA or SHA-1). */ +/* returns + * 1: correct signature + * 0: incorrect signature + * -1: error + */ +int DSA_verify(int type, const unsigned char *dgst, int dgst_len, + const unsigned char *sigbuf, int siglen, DSA *dsa) + { + DSA_SIG *s; + int ret=-1; + + s = DSA_SIG_new(); + if (s == NULL) return(ret); + if (d2i_DSA_SIG(&s,&sigbuf,siglen) == NULL) goto err; + ret=DSA_do_verify(dgst,dgst_len,s,dsa); +err: + DSA_SIG_free(s); + return(ret); + } diff --git a/src/lib/libcrypto/dsa/dsa_err.c b/src/lib/libcrypto/dsa/dsa_err.c index bba984e92e..00545b7b9f 100644 --- a/src/lib/libcrypto/dsa/dsa_err.c +++ b/src/lib/libcrypto/dsa/dsa_err.c @@ -1,6 +1,6 @@ /* crypto/dsa/dsa_err.c */ /* ==================================================================== - * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -76,6 +76,8 @@ static ERR_STRING_DATA DSA_str_functs[]= {ERR_FUNC(DSA_F_DSAPARAMS_PRINT_FP), "DSAparams_print_fp"}, {ERR_FUNC(DSA_F_DSA_DO_SIGN), "DSA_do_sign"}, {ERR_FUNC(DSA_F_DSA_DO_VERIFY), "DSA_do_verify"}, +{ERR_FUNC(DSA_F_DSA_GENERATE_KEY), "DSA_generate_key"}, +{ERR_FUNC(DSA_F_DSA_GENERATE_PARAMETERS_EX), "DSA_generate_parameters_ex"}, {ERR_FUNC(DSA_F_DSA_NEW_METHOD), "DSA_new_method"}, {ERR_FUNC(DSA_F_DSA_PARAM_DECODE), "DSA_PARAM_DECODE"}, {ERR_FUNC(DSA_F_DSA_PRINT_FP), "DSA_print_fp"}, @@ -86,6 +88,7 @@ static ERR_STRING_DATA DSA_str_functs[]= {ERR_FUNC(DSA_F_DSA_SIGN), "DSA_sign"}, {ERR_FUNC(DSA_F_DSA_SIGN_SETUP), "DSA_sign_setup"}, {ERR_FUNC(DSA_F_DSA_SIG_NEW), "DSA_SIG_new"}, +{ERR_FUNC(DSA_F_DSA_SIG_PRINT), "DSA_SIG_PRINT"}, {ERR_FUNC(DSA_F_DSA_VERIFY), "DSA_verify"}, {ERR_FUNC(DSA_F_I2D_DSA_SIG), "i2d_DSA_SIG"}, {ERR_FUNC(DSA_F_OLD_DSA_PRIV_DECODE), "OLD_DSA_PRIV_DECODE"}, @@ -105,6 +108,8 @@ static ERR_STRING_DATA DSA_str_reasons[]= {ERR_REASON(DSA_R_INVALID_DIGEST_TYPE) ,"invalid digest type"}, {ERR_REASON(DSA_R_MISSING_PARAMETERS) ,"missing parameters"}, {ERR_REASON(DSA_R_MODULUS_TOO_LARGE) ,"modulus too large"}, +{ERR_REASON(DSA_R_NEED_NEW_SETUP_VALUES) ,"need new setup values"}, +{ERR_REASON(DSA_R_NON_FIPS_DSA_METHOD) ,"non fips dsa method"}, {ERR_REASON(DSA_R_NO_PARAMETERS_SET) ,"no parameters set"}, {ERR_REASON(DSA_R_PARAMETER_ENCODING_ERROR),"parameter encoding error"}, {0,NULL} diff --git a/src/lib/libcrypto/dsa/dsa_gen.c b/src/lib/libcrypto/dsa/dsa_gen.c index cb0b4538a4..c398761d0d 100644 --- a/src/lib/libcrypto/dsa/dsa_gen.c +++ b/src/lib/libcrypto/dsa/dsa_gen.c @@ -81,13 +81,33 @@ #include #include "dsa_locl.h" +#ifdef OPENSSL_FIPS +#include +#endif + int DSA_generate_parameters_ex(DSA *ret, int bits, const unsigned char *seed_in, int seed_len, int *counter_ret, unsigned long *h_ret, BN_GENCB *cb) { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(ret->meth->flags & DSA_FLAG_FIPS_METHOD) + && !(ret->flags & DSA_FLAG_NON_FIPS_ALLOW)) + { + DSAerr(DSA_F_DSA_GENERATE_PARAMETERS_EX, DSA_R_NON_FIPS_DSA_METHOD); + return 0; + } +#endif if(ret->meth->dsa_paramgen) return ret->meth->dsa_paramgen(ret, bits, seed_in, seed_len, counter_ret, h_ret, cb); +#ifdef OPENSSL_FIPS + else if (FIPS_mode()) + { + return FIPS_dsa_generate_parameters_ex(ret, bits, + seed_in, seed_len, + counter_ret, h_ret, cb); + } +#endif else { const EVP_MD *evpmd; @@ -105,12 +125,13 @@ int DSA_generate_parameters_ex(DSA *ret, int bits, } return dsa_builtin_paramgen(ret, bits, qbits, evpmd, - seed_in, seed_len, counter_ret, h_ret, cb); + seed_in, seed_len, NULL, counter_ret, h_ret, cb); } } int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits, const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len, + unsigned char *seed_out, int *counter_ret, unsigned long *h_ret, BN_GENCB *cb) { int ok=0; @@ -201,8 +222,10 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits, } /* step 2 */ - EVP_Digest(seed, qsize, md, NULL, evpmd, NULL); - EVP_Digest(buf, qsize, buf2, NULL, evpmd, NULL); + if (!EVP_Digest(seed, qsize, md, NULL, evpmd, NULL)) + goto err; + if (!EVP_Digest(buf, qsize, buf2, NULL, evpmd, NULL)) + goto err; for (i = 0; i < qsize; i++) md[i]^=buf2[i]; @@ -251,7 +274,9 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits, break; } - EVP_Digest(buf, qsize, md ,NULL, evpmd, NULL); + if (!EVP_Digest(buf, qsize, md ,NULL, evpmd, + NULL)) + goto err; /* step 8 */ if (!BN_bin2bn(md, qsize, r0)) @@ -332,6 +357,8 @@ err: } if (counter_ret != NULL) *counter_ret=counter; if (h_ret != NULL) *h_ret=h; + if (seed_out) + memcpy(seed_out, seed, qsize); } if(ctx) { diff --git a/src/lib/libcrypto/dsa/dsa_key.c b/src/lib/libcrypto/dsa/dsa_key.c index c4aa86bc6d..9cf669b921 100644 --- a/src/lib/libcrypto/dsa/dsa_key.c +++ b/src/lib/libcrypto/dsa/dsa_key.c @@ -64,12 +64,28 @@ #include #include +#ifdef OPENSSL_FIPS +#include +#endif + static int dsa_builtin_keygen(DSA *dsa); int DSA_generate_key(DSA *dsa) { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD) + && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW)) + { + DSAerr(DSA_F_DSA_GENERATE_KEY, DSA_R_NON_FIPS_DSA_METHOD); + return 0; + } +#endif if(dsa->meth->dsa_keygen) return dsa->meth->dsa_keygen(dsa); +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + return FIPS_dsa_generate_key(dsa); +#endif return dsa_builtin_keygen(dsa); } diff --git a/src/lib/libcrypto/dsa/dsa_lib.c b/src/lib/libcrypto/dsa/dsa_lib.c index e9b75902db..96d8d0c4b4 100644 --- a/src/lib/libcrypto/dsa/dsa_lib.c +++ b/src/lib/libcrypto/dsa/dsa_lib.c @@ -70,6 +70,10 @@ #include #endif +#ifdef OPENSSL_FIPS +#include +#endif + const char DSA_version[]="DSA" OPENSSL_VERSION_PTEXT; static const DSA_METHOD *default_DSA_method = NULL; @@ -82,7 +86,16 @@ void DSA_set_default_method(const DSA_METHOD *meth) const DSA_METHOD *DSA_get_default_method(void) { if(!default_DSA_method) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + return FIPS_dsa_openssl(); + else + return DSA_OpenSSL(); +#else default_DSA_method = DSA_OpenSSL(); +#endif + } return default_DSA_method; } @@ -163,7 +176,7 @@ DSA *DSA_new_method(ENGINE *engine) ret->method_mont_p=NULL; ret->references=1; - ret->flags=ret->meth->flags; + ret->flags=ret->meth->flags & ~DSA_FLAG_NON_FIPS_ALLOW; CRYPTO_new_ex_data(CRYPTO_EX_INDEX_DSA, ret, &ret->ex_data); if ((ret->meth->init != NULL) && !ret->meth->init(ret)) { @@ -276,7 +289,8 @@ void *DSA_get_ex_data(DSA *d, int idx) DH *DSA_dup_DH(const DSA *r) { /* DSA has p, q, g, optional pub_key, optional priv_key. - * DH has p, optional length, g, optional pub_key, optional priv_key. + * DH has p, optional length, g, optional pub_key, optional priv_key, + * optional q. */ DH *ret = NULL; @@ -290,7 +304,11 @@ DH *DSA_dup_DH(const DSA *r) if ((ret->p = BN_dup(r->p)) == NULL) goto err; if (r->q != NULL) + { ret->length = BN_num_bits(r->q); + if ((ret->q = BN_dup(r->q)) == NULL) + goto err; + } if (r->g != NULL) if ((ret->g = BN_dup(r->g)) == NULL) goto err; diff --git a/src/lib/libcrypto/dsa/dsa_locl.h b/src/lib/libcrypto/dsa/dsa_locl.h index 2b8cfee3db..21e2e45242 100644 --- a/src/lib/libcrypto/dsa/dsa_locl.h +++ b/src/lib/libcrypto/dsa/dsa_locl.h @@ -56,4 +56,5 @@ int dsa_builtin_paramgen(DSA *ret, size_t bits, size_t qbits, const EVP_MD *evpmd, const unsigned char *seed_in, size_t seed_len, + unsigned char *seed_out, int *counter_ret, unsigned long *h_ret, BN_GENCB *cb); diff --git a/src/lib/libcrypto/dsa/dsa_ossl.c b/src/lib/libcrypto/dsa/dsa_ossl.c index a3ddd7d281..b3d78e524c 100644 --- a/src/lib/libcrypto/dsa/dsa_ossl.c +++ b/src/lib/libcrypto/dsa/dsa_ossl.c @@ -136,6 +136,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa) BN_CTX *ctx=NULL; int reason=ERR_R_BN_LIB; DSA_SIG *ret=NULL; + int noredo = 0; BN_init(&m); BN_init(&xr); @@ -150,7 +151,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa) if (s == NULL) goto err; ctx=BN_CTX_new(); if (ctx == NULL) goto err; - +redo: if ((dsa->kinv == NULL) || (dsa->r == NULL)) { if (!DSA_sign_setup(dsa,ctx,&kinv,&r)) goto err; @@ -161,6 +162,7 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa) dsa->kinv=NULL; r=dsa->r; dsa->r=NULL; + noredo = 1; } @@ -181,6 +183,18 @@ static DSA_SIG *dsa_do_sign(const unsigned char *dgst, int dlen, DSA *dsa) ret=DSA_SIG_new(); if (ret == NULL) goto err; + /* Redo if r or s is zero as required by FIPS 186-3: this is + * very unlikely. + */ + if (BN_is_zero(r) || BN_is_zero(s)) + { + if (noredo) + { + reason = DSA_R_NEED_NEW_SETUP_VALUES; + goto err; + } + goto redo; + } ret->r = r; ret->s = s; diff --git a/src/lib/libcrypto/dsa/dsa_pmeth.c b/src/lib/libcrypto/dsa/dsa_pmeth.c index e2df54fec6..715d8d675b 100644 --- a/src/lib/libcrypto/dsa/dsa_pmeth.c +++ b/src/lib/libcrypto/dsa/dsa_pmeth.c @@ -189,7 +189,9 @@ static int pkey_dsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) EVP_MD_type((const EVP_MD *)p2) != NID_dsa && EVP_MD_type((const EVP_MD *)p2) != NID_dsaWithSHA && EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && - EVP_MD_type((const EVP_MD *)p2) != NID_sha256) + EVP_MD_type((const EVP_MD *)p2) != NID_sha256 && + EVP_MD_type((const EVP_MD *)p2) != NID_sha384 && + EVP_MD_type((const EVP_MD *)p2) != NID_sha512) { DSAerr(DSA_F_PKEY_DSA_CTRL, DSA_R_INVALID_DIGEST_TYPE); return 0; @@ -253,7 +255,7 @@ static int pkey_dsa_paramgen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) if (!dsa) return 0; ret = dsa_builtin_paramgen(dsa, dctx->nbits, dctx->qbits, dctx->pmd, - NULL, 0, NULL, NULL, pcb); + NULL, 0, NULL, NULL, NULL, pcb); if (ret) EVP_PKEY_assign_DSA(pkey, dsa); else diff --git a/src/lib/libcrypto/dsa/dsa_sign.c b/src/lib/libcrypto/dsa/dsa_sign.c index 17555e5892..c3cc3642ce 100644 --- a/src/lib/libcrypto/dsa/dsa_sign.c +++ b/src/lib/libcrypto/dsa/dsa_sign.c @@ -61,30 +61,54 @@ #include "cryptlib.h" #include #include +#include DSA_SIG * DSA_do_sign(const unsigned char *dgst, int dlen, DSA *dsa) { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD) + && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW)) + { + DSAerr(DSA_F_DSA_DO_SIGN, DSA_R_NON_FIPS_DSA_METHOD); + return NULL; + } +#endif return dsa->meth->dsa_do_sign(dgst, dlen, dsa); } -int DSA_sign(int type, const unsigned char *dgst, int dlen, unsigned char *sig, - unsigned int *siglen, DSA *dsa) +int DSA_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp) { - DSA_SIG *s; - RAND_seed(dgst, dlen); - s=DSA_do_sign(dgst,dlen,dsa); - if (s == NULL) +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD) + && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW)) { - *siglen=0; - return(0); + DSAerr(DSA_F_DSA_SIGN_SETUP, DSA_R_NON_FIPS_DSA_METHOD); + return 0; } - *siglen=i2d_DSA_SIG(s,&sig); - DSA_SIG_free(s); - return(1); +#endif + return dsa->meth->dsa_sign_setup(dsa, ctx_in, kinvp, rp); } -int DSA_sign_setup(DSA *dsa, BN_CTX *ctx_in, BIGNUM **kinvp, BIGNUM **rp) +DSA_SIG *DSA_SIG_new(void) { - return dsa->meth->dsa_sign_setup(dsa, ctx_in, kinvp, rp); + DSA_SIG *sig; + sig = OPENSSL_malloc(sizeof(DSA_SIG)); + if (!sig) + return NULL; + sig->r = NULL; + sig->s = NULL; + return sig; + } + +void DSA_SIG_free(DSA_SIG *sig) + { + if (sig) + { + if (sig->r) + BN_free(sig->r); + if (sig->s) + BN_free(sig->s); + OPENSSL_free(sig); + } } diff --git a/src/lib/libcrypto/dsa/dsa_vrf.c b/src/lib/libcrypto/dsa/dsa_vrf.c index 226a75ff3f..674cb5fa5f 100644 --- a/src/lib/libcrypto/dsa/dsa_vrf.c +++ b/src/lib/libcrypto/dsa/dsa_vrf.c @@ -64,26 +64,13 @@ int DSA_do_verify(const unsigned char *dgst, int dgst_len, DSA_SIG *sig, DSA *dsa) { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(dsa->meth->flags & DSA_FLAG_FIPS_METHOD) + && !(dsa->flags & DSA_FLAG_NON_FIPS_ALLOW)) + { + DSAerr(DSA_F_DSA_DO_VERIFY, DSA_R_NON_FIPS_DSA_METHOD); + return -1; + } +#endif return dsa->meth->dsa_do_verify(dgst, dgst_len, sig, dsa); } - -/* data has already been hashed (probably with SHA or SHA-1). */ -/* returns - * 1: correct signature - * 0: incorrect signature - * -1: error - */ -int DSA_verify(int type, const unsigned char *dgst, int dgst_len, - const unsigned char *sigbuf, int siglen, DSA *dsa) - { - DSA_SIG *s; - int ret=-1; - - s = DSA_SIG_new(); - if (s == NULL) return(ret); - if (d2i_DSA_SIG(&s,&sigbuf,siglen) == NULL) goto err; - ret=DSA_do_verify(dgst,dgst_len,s,dsa); -err: - DSA_SIG_free(s); - return(ret); - } diff --git a/src/lib/libcrypto/dso/dso_dlfcn.c b/src/lib/libcrypto/dso/dso_dlfcn.c index c2bc61760b..5f2254806c 100644 --- a/src/lib/libcrypto/dso/dso_dlfcn.c +++ b/src/lib/libcrypto/dso/dso_dlfcn.c @@ -86,7 +86,8 @@ DSO_METHOD *DSO_METHOD_dlfcn(void) # if defined(_AIX) || defined(__CYGWIN__) || \ defined(__SCO_VERSION__) || defined(_SCO_ELF) || \ (defined(__osf__) && !defined(RTLD_NEXT)) || \ - (defined(__OpenBSD__) && !defined(RTLD_SELF)) + (defined(__OpenBSD__) && !defined(RTLD_SELF)) || \ + defined(__ANDROID__) # undef HAVE_DLINFO # endif #endif diff --git a/src/lib/libcrypto/ec/ec.h b/src/lib/libcrypto/ec/ec.h index ee7078130c..9d01325af3 100644 --- a/src/lib/libcrypto/ec/ec.h +++ b/src/lib/libcrypto/ec/ec.h @@ -151,7 +151,24 @@ const EC_METHOD *EC_GFp_mont_method(void); */ const EC_METHOD *EC_GFp_nist_method(void); +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 +/** Returns 64-bit optimized methods for nistp224 + * \return EC_METHOD object + */ +const EC_METHOD *EC_GFp_nistp224_method(void); + +/** Returns 64-bit optimized methods for nistp256 + * \return EC_METHOD object + */ +const EC_METHOD *EC_GFp_nistp256_method(void); + +/** Returns 64-bit optimized methods for nistp521 + * \return EC_METHOD object + */ +const EC_METHOD *EC_GFp_nistp521_method(void); +#endif +#ifndef OPENSSL_NO_EC2M /********************************************************************/ /* EC_METHOD for curves over GF(2^m) */ /********************************************************************/ @@ -161,6 +178,8 @@ const EC_METHOD *EC_GFp_nist_method(void); */ const EC_METHOD *EC_GF2m_simple_method(void); +#endif + /********************************************************************/ /* EC_GROUP functions */ @@ -282,6 +301,7 @@ int EC_GROUP_set_curve_GFp(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, co */ int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx); +#ifndef OPENSSL_NO_EC2M /** Sets the parameter of a ec over GF2m defined by y^2 + x*y = x^3 + a*x^2 + b * \param group EC_GROUP object * \param p BIGNUM with the polynomial defining the underlying field @@ -301,7 +321,7 @@ int EC_GROUP_set_curve_GF2m(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, c * \return 1 on success and 0 if an error occured */ int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM *b, BN_CTX *ctx); - +#endif /** Returns the number of bits needed to represent a field element * \param group EC_GROUP object * \return number of bits needed to represent a field element @@ -342,7 +362,7 @@ int EC_GROUP_cmp(const EC_GROUP *a, const EC_GROUP *b, BN_CTX *ctx); * \return newly created EC_GROUP object with the specified parameters */ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx); - +#ifndef OPENSSL_NO_EC2M /** Creates a new EC_GROUP object with the specified parameters defined * over GF2m (defined by the equation y^2 + x*y = x^3 + a*x^2 + b) * \param p BIGNUM with the polynomial defining the underlying field @@ -352,7 +372,7 @@ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM * \return newly created EC_GROUP object with the specified parameters */ EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx); - +#endif /** Creates a EC_GROUP object with a curve specified by a NID * \param nid NID of the OID of the curve name * \return newly created EC_GROUP object with specified curve or NULL @@ -481,7 +501,7 @@ int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, */ int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *p, const BIGNUM *x, int y_bit, BN_CTX *ctx); - +#ifndef OPENSSL_NO_EC2M /** Sets the affine coordinates of a EC_POINT over GF2m * \param group underlying EC_GROUP object * \param p EC_POINT object @@ -514,7 +534,7 @@ int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group, */ int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *p, const BIGNUM *x, int y_bit, BN_CTX *ctx); - +#endif /** Encodes a EC_POINT object to a octet string * \param group underlying EC_GROUP object * \param p EC_POINT object @@ -653,9 +673,11 @@ int EC_GROUP_have_precompute_mult(const EC_GROUP *group); /* EC_GROUP_get_basis_type() returns the NID of the basis type * used to represent the field elements */ int EC_GROUP_get_basis_type(const EC_GROUP *); +#ifndef OPENSSL_NO_EC2M int EC_GROUP_get_trinomial_basis(const EC_GROUP *, unsigned int *k); int EC_GROUP_get_pentanomial_basis(const EC_GROUP *, unsigned int *k1, unsigned int *k2, unsigned int *k3); +#endif #define OPENSSL_EC_NAMED_CURVE 0x001 @@ -689,11 +711,21 @@ typedef struct ec_key_st EC_KEY; #define EC_PKEY_NO_PARAMETERS 0x001 #define EC_PKEY_NO_PUBKEY 0x002 +/* some values for the flags field */ +#define EC_FLAG_NON_FIPS_ALLOW 0x1 +#define EC_FLAG_FIPS_CHECKED 0x2 + /** Creates a new EC_KEY object. * \return EC_KEY object or NULL if an error occurred. */ EC_KEY *EC_KEY_new(void); +int EC_KEY_get_flags(const EC_KEY *key); + +void EC_KEY_set_flags(EC_KEY *key, int flags); + +void EC_KEY_clear_flags(EC_KEY *key, int flags); + /** Creates a new EC_KEY object using a named curve as underlying * EC_GROUP object. * \param nid NID of the named curve. @@ -799,6 +831,15 @@ int EC_KEY_generate_key(EC_KEY *key); */ int EC_KEY_check_key(const EC_KEY *key); +/** Sets a public key from affine coordindates performing + * neccessary NIST PKV tests. + * \param key the EC_KEY object + * \param x public key x coordinate + * \param y public key y coordinate + * \return 1 on success and 0 otherwise. + */ +int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y); + /********************************************************************/ /* de- and encoding functions for SEC1 ECPrivateKey */ @@ -926,6 +967,7 @@ void ERR_load_EC_strings(void); /* Error codes for the EC functions. */ /* Function codes. */ +#define EC_F_BN_TO_FELEM 224 #define EC_F_COMPUTE_WNAF 143 #define EC_F_D2I_ECPARAMETERS 144 #define EC_F_D2I_ECPKPARAMETERS 145 @@ -968,6 +1010,15 @@ void ERR_load_EC_strings(void); #define EC_F_EC_GFP_MONT_FIELD_SQR 132 #define EC_F_EC_GFP_MONT_GROUP_SET_CURVE 189 #define EC_F_EC_GFP_MONT_GROUP_SET_CURVE_GFP 135 +#define EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE 225 +#define EC_F_EC_GFP_NISTP224_POINTS_MUL 228 +#define EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES 226 +#define EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE 230 +#define EC_F_EC_GFP_NISTP256_POINTS_MUL 231 +#define EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES 232 +#define EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE 233 +#define EC_F_EC_GFP_NISTP521_POINTS_MUL 234 +#define EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES 235 #define EC_F_EC_GFP_NIST_FIELD_MUL 200 #define EC_F_EC_GFP_NIST_FIELD_SQR 201 #define EC_F_EC_GFP_NIST_GROUP_SET_CURVE 202 @@ -1010,6 +1061,7 @@ void ERR_load_EC_strings(void); #define EC_F_EC_KEY_NEW 182 #define EC_F_EC_KEY_PRINT 180 #define EC_F_EC_KEY_PRINT_FP 181 +#define EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES 229 #define EC_F_EC_POINTS_MAKE_AFFINE 136 #define EC_F_EC_POINT_ADD 112 #define EC_F_EC_POINT_CMP 113 @@ -1040,6 +1092,9 @@ void ERR_load_EC_strings(void); #define EC_F_I2D_ECPKPARAMETERS 191 #define EC_F_I2D_ECPRIVATEKEY 192 #define EC_F_I2O_ECPUBLICKEY 151 +#define EC_F_NISTP224_PRE_COMP_NEW 227 +#define EC_F_NISTP256_PRE_COMP_NEW 236 +#define EC_F_NISTP521_PRE_COMP_NEW 237 #define EC_F_O2I_ECPUBLICKEY 152 #define EC_F_OLD_EC_PRIV_DECODE 222 #define EC_F_PKEY_EC_CTRL 197 @@ -1052,12 +1107,15 @@ void ERR_load_EC_strings(void); /* Reason codes. */ #define EC_R_ASN1_ERROR 115 #define EC_R_ASN1_UNKNOWN_FIELD 116 +#define EC_R_BIGNUM_OUT_OF_RANGE 144 #define EC_R_BUFFER_TOO_SMALL 100 +#define EC_R_COORDINATES_OUT_OF_RANGE 146 #define EC_R_D2I_ECPKPARAMETERS_FAILURE 117 #define EC_R_DECODE_ERROR 142 #define EC_R_DISCRIMINANT_IS_ZERO 118 #define EC_R_EC_GROUP_NEW_BY_NAME_FAILURE 119 #define EC_R_FIELD_TOO_LARGE 143 +#define EC_R_GF2M_NOT_SUPPORTED 147 #define EC_R_GROUP2PKPARAMETERS_FAILURE 120 #define EC_R_I2D_ECPKPARAMETERS_FAILURE 121 #define EC_R_INCOMPATIBLE_OBJECTS 101 @@ -1092,6 +1150,7 @@ void ERR_load_EC_strings(void); #define EC_R_UNKNOWN_GROUP 129 #define EC_R_UNKNOWN_ORDER 114 #define EC_R_UNSUPPORTED_FIELD 131 +#define EC_R_WRONG_CURVE_PARAMETERS 145 #define EC_R_WRONG_ORDER 130 #ifdef __cplusplus diff --git a/src/lib/libcrypto/ec/ec2_mult.c b/src/lib/libcrypto/ec/ec2_mult.c index e12b9b284a..26f4a783fc 100644 --- a/src/lib/libcrypto/ec/ec2_mult.c +++ b/src/lib/libcrypto/ec/ec2_mult.c @@ -71,6 +71,8 @@ #include "ec_lcl.h" +#ifndef OPENSSL_NO_EC2M + /* Compute the x-coordinate x/z for the point 2*(x/z) in Montgomery projective * coordinates. @@ -384,3 +386,5 @@ int ec_GF2m_have_precompute_mult(const EC_GROUP *group) { return ec_wNAF_have_precompute_mult(group); } + +#endif diff --git a/src/lib/libcrypto/ec/ec2_oct.c b/src/lib/libcrypto/ec/ec2_oct.c new file mode 100644 index 0000000000..f1d75e5ddf --- /dev/null +++ b/src/lib/libcrypto/ec/ec2_oct.c @@ -0,0 +1,407 @@ +/* crypto/ec/ec2_oct.c */ +/* ==================================================================== + * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. + * + * The Elliptic Curve Public-Key Crypto Library (ECC Code) included + * herein is developed by SUN MICROSYSTEMS, INC., and is contributed + * to the OpenSSL project. + * + * The ECC Code is licensed pursuant to the OpenSSL open source + * license provided below. + * + * The software is originally written by Sheueling Chang Shantz and + * Douglas Stebila of Sun Microsystems Laboratories. + * + */ +/* ==================================================================== + * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + +#include + +#include "ec_lcl.h" + +#ifndef OPENSSL_NO_EC2M + +/* Calculates and sets the affine coordinates of an EC_POINT from the given + * compressed coordinates. Uses algorithm 2.3.4 of SEC 1. + * Note that the simple implementation only uses affine coordinates. + * + * The method is from the following publication: + * + * Harper, Menezes, Vanstone: + * "Public-Key Cryptosystems with Very Small Key Lengths", + * EUROCRYPT '92, Springer-Verlag LNCS 658, + * published February 1993 + * + * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe + * the same method, but claim no priority date earlier than July 29, 1994 + * (and additionally fail to cite the EUROCRYPT '92 publication as prior art). + */ +int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point, + const BIGNUM *x_, int y_bit, BN_CTX *ctx) + { + BN_CTX *new_ctx = NULL; + BIGNUM *tmp, *x, *y, *z; + int ret = 0, z0; + + /* clear error queue */ + ERR_clear_error(); + + if (ctx == NULL) + { + ctx = new_ctx = BN_CTX_new(); + if (ctx == NULL) + return 0; + } + + y_bit = (y_bit != 0) ? 1 : 0; + + BN_CTX_start(ctx); + tmp = BN_CTX_get(ctx); + x = BN_CTX_get(ctx); + y = BN_CTX_get(ctx); + z = BN_CTX_get(ctx); + if (z == NULL) goto err; + + if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err; + if (BN_is_zero(x)) + { + if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err; + } + else + { + if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err; + if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err; + if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err; + if (!BN_GF2m_add(tmp, x, tmp)) goto err; + if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx)) + { + unsigned long err = ERR_peek_last_error(); + + if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION) + { + ERR_clear_error(); + ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT); + } + else + ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB); + goto err; + } + z0 = (BN_is_odd(z)) ? 1 : 0; + if (!group->meth->field_mul(group, y, x, z, ctx)) goto err; + if (z0 != y_bit) + { + if (!BN_GF2m_add(y, y, x)) goto err; + } + } + + if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; + + ret = 1; + + err: + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + } + + +/* Converts an EC_POINT to an octet string. + * If buf is NULL, the encoded length will be returned. + * If the length len of buf is smaller than required an error will be returned. + */ +size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, + unsigned char *buf, size_t len, BN_CTX *ctx) + { + size_t ret; + BN_CTX *new_ctx = NULL; + int used_ctx = 0; + BIGNUM *x, *y, *yxi; + size_t field_len, i, skip; + + if ((form != POINT_CONVERSION_COMPRESSED) + && (form != POINT_CONVERSION_UNCOMPRESSED) + && (form != POINT_CONVERSION_HYBRID)) + { + ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM); + goto err; + } + + if (EC_POINT_is_at_infinity(group, point)) + { + /* encodes to a single 0 octet */ + if (buf != NULL) + { + if (len < 1) + { + ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); + return 0; + } + buf[0] = 0; + } + return 1; + } + + + /* ret := required output buffer length */ + field_len = (EC_GROUP_get_degree(group) + 7) / 8; + ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; + + /* if 'buf' is NULL, just return required length */ + if (buf != NULL) + { + if (len < ret) + { + ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); + goto err; + } + + if (ctx == NULL) + { + ctx = new_ctx = BN_CTX_new(); + if (ctx == NULL) + return 0; + } + + BN_CTX_start(ctx); + used_ctx = 1; + x = BN_CTX_get(ctx); + y = BN_CTX_get(ctx); + yxi = BN_CTX_get(ctx); + if (yxi == NULL) goto err; + + if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; + + buf[0] = form; + if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x)) + { + if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err; + if (BN_is_odd(yxi)) buf[0]++; + } + + i = 1; + + skip = field_len - BN_num_bytes(x); + if (skip > field_len) + { + ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + while (skip > 0) + { + buf[i++] = 0; + skip--; + } + skip = BN_bn2bin(x, buf + i); + i += skip; + if (i != 1 + field_len) + { + ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + + if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID) + { + skip = field_len - BN_num_bytes(y); + if (skip > field_len) + { + ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + while (skip > 0) + { + buf[i++] = 0; + skip--; + } + skip = BN_bn2bin(y, buf + i); + i += skip; + } + + if (i != ret) + { + ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + } + + if (used_ctx) + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + + err: + if (used_ctx) + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return 0; + } + + +/* Converts an octet string representation to an EC_POINT. + * Note that the simple implementation only uses affine coordinates. + */ +int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point, + const unsigned char *buf, size_t len, BN_CTX *ctx) + { + point_conversion_form_t form; + int y_bit; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y, *yxi; + size_t field_len, enc_len; + int ret = 0; + + if (len == 0) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL); + return 0; + } + form = buf[0]; + y_bit = form & 1; + form = form & ~1U; + if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED) + && (form != POINT_CONVERSION_UNCOMPRESSED) + && (form != POINT_CONVERSION_HYBRID)) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + + if (form == 0) + { + if (len != 1) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + + return EC_POINT_set_to_infinity(group, point); + } + + field_len = (EC_GROUP_get_degree(group) + 7) / 8; + enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; + + if (len != enc_len) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + + if (ctx == NULL) + { + ctx = new_ctx = BN_CTX_new(); + if (ctx == NULL) + return 0; + } + + BN_CTX_start(ctx); + x = BN_CTX_get(ctx); + y = BN_CTX_get(ctx); + yxi = BN_CTX_get(ctx); + if (yxi == NULL) goto err; + + if (!BN_bin2bn(buf + 1, field_len, x)) goto err; + if (BN_ucmp(x, &group->field) >= 0) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + goto err; + } + + if (form == POINT_CONVERSION_COMPRESSED) + { + if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err; + } + else + { + if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err; + if (BN_ucmp(y, &group->field) >= 0) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + goto err; + } + if (form == POINT_CONVERSION_HYBRID) + { + if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err; + if (y_bit != BN_is_odd(yxi)) + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + goto err; + } + } + + if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; + } + + if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */ + { + ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE); + goto err; + } + + ret = 1; + + err: + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + } +#endif diff --git a/src/lib/libcrypto/ec/ec2_smpl.c b/src/lib/libcrypto/ec/ec2_smpl.c index 03deae6674..e0e59c7d82 100644 --- a/src/lib/libcrypto/ec/ec2_smpl.c +++ b/src/lib/libcrypto/ec/ec2_smpl.c @@ -71,10 +71,20 @@ #include "ec_lcl.h" +#ifndef OPENSSL_NO_EC2M + +#ifdef OPENSSL_FIPS +#include +#endif + const EC_METHOD *EC_GF2m_simple_method(void) { +#ifdef OPENSSL_FIPS + return fips_ec_gf2m_simple_method(); +#else static const EC_METHOD ret = { + EC_FLAGS_DEFAULT_OCT, NID_X9_62_characteristic_two_field, ec_GF2m_simple_group_init, ec_GF2m_simple_group_finish, @@ -93,9 +103,7 @@ const EC_METHOD *EC_GF2m_simple_method(void) 0 /* get_Jprojective_coordinates_GFp */, ec_GF2m_simple_point_set_affine_coordinates, ec_GF2m_simple_point_get_affine_coordinates, - ec_GF2m_simple_set_compressed_coordinates, - ec_GF2m_simple_point2oct, - ec_GF2m_simple_oct2point, + 0,0,0, ec_GF2m_simple_add, ec_GF2m_simple_dbl, ec_GF2m_simple_invert, @@ -118,6 +126,7 @@ const EC_METHOD *EC_GF2m_simple_method(void) 0 /* field_set_to_one */ }; return &ret; +#endif } @@ -405,340 +414,6 @@ int ec_GF2m_simple_point_get_affine_coordinates(const EC_GROUP *group, const EC_ return ret; } - -/* Calculates and sets the affine coordinates of an EC_POINT from the given - * compressed coordinates. Uses algorithm 2.3.4 of SEC 1. - * Note that the simple implementation only uses affine coordinates. - * - * The method is from the following publication: - * - * Harper, Menezes, Vanstone: - * "Public-Key Cryptosystems with Very Small Key Lengths", - * EUROCRYPT '92, Springer-Verlag LNCS 658, - * published February 1993 - * - * US Patents 6,141,420 and 6,618,483 (Vanstone, Mullin, Agnew) describe - * the same method, but claim no priority date earlier than July 29, 1994 - * (and additionally fail to cite the EUROCRYPT '92 publication as prior art). - */ -int ec_GF2m_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point, - const BIGNUM *x_, int y_bit, BN_CTX *ctx) - { - BN_CTX *new_ctx = NULL; - BIGNUM *tmp, *x, *y, *z; - int ret = 0, z0; - - /* clear error queue */ - ERR_clear_error(); - - if (ctx == NULL) - { - ctx = new_ctx = BN_CTX_new(); - if (ctx == NULL) - return 0; - } - - y_bit = (y_bit != 0) ? 1 : 0; - - BN_CTX_start(ctx); - tmp = BN_CTX_get(ctx); - x = BN_CTX_get(ctx); - y = BN_CTX_get(ctx); - z = BN_CTX_get(ctx); - if (z == NULL) goto err; - - if (!BN_GF2m_mod_arr(x, x_, group->poly)) goto err; - if (BN_is_zero(x)) - { - if (!BN_GF2m_mod_sqrt_arr(y, &group->b, group->poly, ctx)) goto err; - } - else - { - if (!group->meth->field_sqr(group, tmp, x, ctx)) goto err; - if (!group->meth->field_div(group, tmp, &group->b, tmp, ctx)) goto err; - if (!BN_GF2m_add(tmp, &group->a, tmp)) goto err; - if (!BN_GF2m_add(tmp, x, tmp)) goto err; - if (!BN_GF2m_mod_solve_quad_arr(z, tmp, group->poly, ctx)) - { - unsigned long err = ERR_peek_last_error(); - - if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NO_SOLUTION) - { - ERR_clear_error(); - ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT); - } - else - ECerr(EC_F_EC_GF2M_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB); - goto err; - } - z0 = (BN_is_odd(z)) ? 1 : 0; - if (!group->meth->field_mul(group, y, x, z, ctx)) goto err; - if (z0 != y_bit) - { - if (!BN_GF2m_add(y, y, x)) goto err; - } - } - - if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; - - ret = 1; - - err: - BN_CTX_end(ctx); - if (new_ctx != NULL) - BN_CTX_free(new_ctx); - return ret; - } - - -/* Converts an EC_POINT to an octet string. - * If buf is NULL, the encoded length will be returned. - * If the length len of buf is smaller than required an error will be returned. - */ -size_t ec_GF2m_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, - unsigned char *buf, size_t len, BN_CTX *ctx) - { - size_t ret; - BN_CTX *new_ctx = NULL; - int used_ctx = 0; - BIGNUM *x, *y, *yxi; - size_t field_len, i, skip; - - if ((form != POINT_CONVERSION_COMPRESSED) - && (form != POINT_CONVERSION_UNCOMPRESSED) - && (form != POINT_CONVERSION_HYBRID)) - { - ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_INVALID_FORM); - goto err; - } - - if (EC_POINT_is_at_infinity(group, point)) - { - /* encodes to a single 0 octet */ - if (buf != NULL) - { - if (len < 1) - { - ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); - return 0; - } - buf[0] = 0; - } - return 1; - } - - - /* ret := required output buffer length */ - field_len = (EC_GROUP_get_degree(group) + 7) / 8; - ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; - - /* if 'buf' is NULL, just return required length */ - if (buf != NULL) - { - if (len < ret) - { - ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); - goto err; - } - - if (ctx == NULL) - { - ctx = new_ctx = BN_CTX_new(); - if (ctx == NULL) - return 0; - } - - BN_CTX_start(ctx); - used_ctx = 1; - x = BN_CTX_get(ctx); - y = BN_CTX_get(ctx); - yxi = BN_CTX_get(ctx); - if (yxi == NULL) goto err; - - if (!EC_POINT_get_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; - - buf[0] = form; - if ((form != POINT_CONVERSION_UNCOMPRESSED) && !BN_is_zero(x)) - { - if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err; - if (BN_is_odd(yxi)) buf[0]++; - } - - i = 1; - - skip = field_len - BN_num_bytes(x); - if (skip > field_len) - { - ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); - goto err; - } - while (skip > 0) - { - buf[i++] = 0; - skip--; - } - skip = BN_bn2bin(x, buf + i); - i += skip; - if (i != 1 + field_len) - { - ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); - goto err; - } - - if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID) - { - skip = field_len - BN_num_bytes(y); - if (skip > field_len) - { - ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); - goto err; - } - while (skip > 0) - { - buf[i++] = 0; - skip--; - } - skip = BN_bn2bin(y, buf + i); - i += skip; - } - - if (i != ret) - { - ECerr(EC_F_EC_GF2M_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); - goto err; - } - } - - if (used_ctx) - BN_CTX_end(ctx); - if (new_ctx != NULL) - BN_CTX_free(new_ctx); - return ret; - - err: - if (used_ctx) - BN_CTX_end(ctx); - if (new_ctx != NULL) - BN_CTX_free(new_ctx); - return 0; - } - - -/* Converts an octet string representation to an EC_POINT. - * Note that the simple implementation only uses affine coordinates. - */ -int ec_GF2m_simple_oct2point(const EC_GROUP *group, EC_POINT *point, - const unsigned char *buf, size_t len, BN_CTX *ctx) - { - point_conversion_form_t form; - int y_bit; - BN_CTX *new_ctx = NULL; - BIGNUM *x, *y, *yxi; - size_t field_len, enc_len; - int ret = 0; - - if (len == 0) - { - ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL); - return 0; - } - form = buf[0]; - y_bit = form & 1; - form = form & ~1U; - if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED) - && (form != POINT_CONVERSION_UNCOMPRESSED) - && (form != POINT_CONVERSION_HYBRID)) - { - ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); - return 0; - } - if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit) - { - ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); - return 0; - } - - if (form == 0) - { - if (len != 1) - { - ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); - return 0; - } - - return EC_POINT_set_to_infinity(group, point); - } - - field_len = (EC_GROUP_get_degree(group) + 7) / 8; - enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; - - if (len != enc_len) - { - ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); - return 0; - } - - if (ctx == NULL) - { - ctx = new_ctx = BN_CTX_new(); - if (ctx == NULL) - return 0; - } - - BN_CTX_start(ctx); - x = BN_CTX_get(ctx); - y = BN_CTX_get(ctx); - yxi = BN_CTX_get(ctx); - if (yxi == NULL) goto err; - - if (!BN_bin2bn(buf + 1, field_len, x)) goto err; - if (BN_ucmp(x, &group->field) >= 0) - { - ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); - goto err; - } - - if (form == POINT_CONVERSION_COMPRESSED) - { - if (!EC_POINT_set_compressed_coordinates_GF2m(group, point, x, y_bit, ctx)) goto err; - } - else - { - if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err; - if (BN_ucmp(y, &group->field) >= 0) - { - ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); - goto err; - } - if (form == POINT_CONVERSION_HYBRID) - { - if (!group->meth->field_div(group, yxi, y, x, ctx)) goto err; - if (y_bit != BN_is_odd(yxi)) - { - ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); - goto err; - } - } - - if (!EC_POINT_set_affine_coordinates_GF2m(group, point, x, y, ctx)) goto err; - } - - if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */ - { - ECerr(EC_F_EC_GF2M_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE); - goto err; - } - - ret = 1; - - err: - BN_CTX_end(ctx); - if (new_ctx != NULL) - BN_CTX_free(new_ctx); - return ret; - } - - /* Computes a + b and stores the result in r. r could be a or b, a could be b. * Uses algorithm A.10.2 of IEEE P1363. */ @@ -1040,3 +715,5 @@ int ec_GF2m_simple_field_div(const EC_GROUP *group, BIGNUM *r, const BIGNUM *a, { return BN_GF2m_mod_div(r, a, b, &group->field, ctx); } + +#endif diff --git a/src/lib/libcrypto/ec/ec_ameth.c b/src/lib/libcrypto/ec/ec_ameth.c index c00f7d746c..83909c1853 100644 --- a/src/lib/libcrypto/ec/ec_ameth.c +++ b/src/lib/libcrypto/ec/ec_ameth.c @@ -651,6 +651,7 @@ const EVP_PKEY_ASN1_METHOD eckey_asn1_meth = ec_copy_parameters, ec_cmp_parameters, eckey_param_print, + 0, int_ec_free, ec_pkey_ctrl, diff --git a/src/lib/libcrypto/ec/ec_asn1.c b/src/lib/libcrypto/ec/ec_asn1.c index ae55539859..175eec5342 100644 --- a/src/lib/libcrypto/ec/ec_asn1.c +++ b/src/lib/libcrypto/ec/ec_asn1.c @@ -83,7 +83,7 @@ int EC_GROUP_get_basis_type(const EC_GROUP *group) /* everything else is currently not supported */ return 0; } - +#ifndef OPENSSL_NO_EC2M int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k) { if (group == NULL) @@ -101,7 +101,6 @@ int EC_GROUP_get_trinomial_basis(const EC_GROUP *group, unsigned int *k) return 1; } - int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1, unsigned int *k2, unsigned int *k3) { @@ -124,7 +123,7 @@ int EC_GROUP_get_pentanomial_basis(const EC_GROUP *group, unsigned int *k1, return 1; } - +#endif /* some structures needed for the asn1 encoding */ @@ -340,6 +339,12 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field) } } else /* nid == NID_X9_62_characteristic_two_field */ +#ifdef OPENSSL_NO_EC2M + { + ECerr(EC_F_EC_ASN1_GROUP2FIELDID, EC_R_GF2M_NOT_SUPPORTED); + goto err; + } +#else { int field_type; X9_62_CHARACTERISTIC_TWO *char_two; @@ -419,6 +424,7 @@ static int ec_asn1_group2fieldid(const EC_GROUP *group, X9_62_FIELDID *field) } } } +#endif ok = 1; @@ -456,6 +462,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve) goto err; } } +#ifndef OPENSSL_NO_EC2M else /* nid == NID_X9_62_characteristic_two_field */ { if (!EC_GROUP_get_curve_GF2m(group, NULL, tmp_1, tmp_2, NULL)) @@ -464,7 +471,7 @@ static int ec_asn1_group2curve(const EC_GROUP *group, X9_62_CURVE *curve) goto err; } } - +#endif len_1 = (size_t)BN_num_bytes(tmp_1); len_2 = (size_t)BN_num_bytes(tmp_2); @@ -775,8 +782,13 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params) /* get the field parameters */ tmp = OBJ_obj2nid(params->fieldID->fieldType); - if (tmp == NID_X9_62_characteristic_two_field) +#ifdef OPENSSL_NO_EC2M + { + ECerr(EC_F_EC_ASN1_PARAMETERS2GROUP, EC_R_GF2M_NOT_SUPPORTED); + goto err; + } +#else { X9_62_CHARACTERISTIC_TWO *char_two; @@ -862,6 +874,7 @@ static EC_GROUP *ec_asn1_parameters2group(const ECPARAMETERS *params) /* create the EC_GROUP structure */ ret = EC_GROUP_new_curve_GF2m(p, a, b, NULL); } +#endif else if (tmp == NID_X9_62_prime_field) { /* we have a curve over a prime field */ @@ -1065,6 +1078,7 @@ EC_GROUP *d2i_ECPKParameters(EC_GROUP **a, const unsigned char **in, long len) if ((group = ec_asn1_pkparameters2group(params)) == NULL) { ECerr(EC_F_D2I_ECPKPARAMETERS, EC_R_PKPARAMETERS2GROUP_FAILURE); + ECPKPARAMETERS_free(params); return NULL; } diff --git a/src/lib/libcrypto/ec/ec_curve.c b/src/lib/libcrypto/ec/ec_curve.c index 23274e4031..c72fb2697c 100644 --- a/src/lib/libcrypto/ec/ec_curve.c +++ b/src/lib/libcrypto/ec/ec_curve.c @@ -3,7 +3,7 @@ * Written by Nils Larsch for the OpenSSL project. */ /* ==================================================================== - * Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved. + * Copyright (c) 1998-2010 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -72,6 +72,7 @@ #include "ec_lcl.h" #include #include +#include typedef struct { int field_type, /* either NID_X9_62_prime_field or @@ -703,6 +704,8 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+28*6]; } 0x13,0xDD,0x29,0x45,0x5C,0x5C,0x2A,0x3D } }; +#ifndef OPENSSL_NO_EC2M + /* characteristic two curves */ static const struct { EC_CURVE_DATA h; unsigned char data[20+15*6]; } _EC_SECG_CHAR2_113R1 = { @@ -1300,7 +1303,7 @@ static const struct { EC_CURVE_DATA h; unsigned char data[20+21*6]; } { 0x53,0x81,0x4C,0x05,0x0D,0x44,0xD6,0x96,0xE6,0x76, /* seed */ 0x87,0x56,0x15,0x17,0x58,0x0C,0xA4,0xE2,0x9F,0xFD, - 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */ + 0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* p */ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01, 0x07, 0x01,0x08,0xB3,0x9E,0x77,0xC4,0xB1,0x08,0xBE,0xD9, /* a */ @@ -1817,103 +1820,128 @@ static const struct { EC_CURVE_DATA h; unsigned char data[0+24*6]; } 0xBA,0xFC,0xA7,0x5E } }; +#endif + typedef struct _ec_list_element_st { int nid; const EC_CURVE_DATA *data; + const EC_METHOD *(*meth)(void); const char *comment; } ec_list_element; static const ec_list_element curve_list[] = { - /* prime field curves */ + /* prime field curves */ /* secg curves */ - { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"}, - { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, "SECG curve over a 112 bit prime field"}, - { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, "SECG curve over a 128 bit prime field"}, - { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, "SECG curve over a 128 bit prime field"}, - { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, "SECG curve over a 160 bit prime field"}, - { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, "SECG curve over a 160 bit prime field"}, - { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"}, + { NID_secp112r1, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" }, + { NID_secp112r2, &_EC_SECG_PRIME_112R2.h, 0, "SECG curve over a 112 bit prime field" }, + { NID_secp128r1, &_EC_SECG_PRIME_128R1.h, 0, "SECG curve over a 128 bit prime field" }, + { NID_secp128r2, &_EC_SECG_PRIME_128R2.h, 0, "SECG curve over a 128 bit prime field" }, + { NID_secp160k1, &_EC_SECG_PRIME_160K1.h, 0, "SECG curve over a 160 bit prime field" }, + { NID_secp160r1, &_EC_SECG_PRIME_160R1.h, 0, "SECG curve over a 160 bit prime field" }, + { NID_secp160r2, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" }, /* SECG secp192r1 is the same as X9.62 prime192v1 and hence omitted */ - { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, "SECG curve over a 192 bit prime field"}, - { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, "SECG curve over a 224 bit prime field"}, - { NID_secp224r1, &_EC_NIST_PRIME_224.h, "NIST/SECG curve over a 224 bit prime field"}, - { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, "SECG curve over a 256 bit prime field"}, + { NID_secp192k1, &_EC_SECG_PRIME_192K1.h, 0, "SECG curve over a 192 bit prime field" }, + { NID_secp224k1, &_EC_SECG_PRIME_224K1.h, 0, "SECG curve over a 224 bit prime field" }, +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 + { NID_secp224r1, &_EC_NIST_PRIME_224.h, EC_GFp_nistp224_method, "NIST/SECG curve over a 224 bit prime field" }, +#else + { NID_secp224r1, &_EC_NIST_PRIME_224.h, 0, "NIST/SECG curve over a 224 bit prime field" }, +#endif + { NID_secp256k1, &_EC_SECG_PRIME_256K1.h, 0, "SECG curve over a 256 bit prime field" }, /* SECG secp256r1 is the same as X9.62 prime256v1 and hence omitted */ - { NID_secp384r1, &_EC_NIST_PRIME_384.h, "NIST/SECG curve over a 384 bit prime field"}, - { NID_secp521r1, &_EC_NIST_PRIME_521.h, "NIST/SECG curve over a 521 bit prime field"}, + { NID_secp384r1, &_EC_NIST_PRIME_384.h, 0, "NIST/SECG curve over a 384 bit prime field" }, +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 + { NID_secp521r1, &_EC_NIST_PRIME_521.h, EC_GFp_nistp521_method, "NIST/SECG curve over a 521 bit prime field" }, +#else + { NID_secp521r1, &_EC_NIST_PRIME_521.h, 0, "NIST/SECG curve over a 521 bit prime field" }, +#endif /* X9.62 curves */ - { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, "NIST/X9.62/SECG curve over a 192 bit prime field"}, - { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, "X9.62 curve over a 192 bit prime field"}, - { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, "X9.62 curve over a 192 bit prime field"}, - { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, "X9.62 curve over a 239 bit prime field"}, - { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, "X9.62 curve over a 239 bit prime field"}, - { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, "X9.62 curve over a 239 bit prime field"}, - { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, "X9.62/SECG curve over a 256 bit prime field"}, + { NID_X9_62_prime192v1, &_EC_NIST_PRIME_192.h, 0, "NIST/X9.62/SECG curve over a 192 bit prime field" }, + { NID_X9_62_prime192v2, &_EC_X9_62_PRIME_192V2.h, 0, "X9.62 curve over a 192 bit prime field" }, + { NID_X9_62_prime192v3, &_EC_X9_62_PRIME_192V3.h, 0, "X9.62 curve over a 192 bit prime field" }, + { NID_X9_62_prime239v1, &_EC_X9_62_PRIME_239V1.h, 0, "X9.62 curve over a 239 bit prime field" }, + { NID_X9_62_prime239v2, &_EC_X9_62_PRIME_239V2.h, 0, "X9.62 curve over a 239 bit prime field" }, + { NID_X9_62_prime239v3, &_EC_X9_62_PRIME_239V3.h, 0, "X9.62 curve over a 239 bit prime field" }, +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 + { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, EC_GFp_nistp256_method, "X9.62/SECG curve over a 256 bit prime field" }, +#else + { NID_X9_62_prime256v1, &_EC_X9_62_PRIME_256V1.h, 0, "X9.62/SECG curve over a 256 bit prime field" }, +#endif +#ifndef OPENSSL_NO_EC2M /* characteristic two field curves */ /* NIST/SECG curves */ - { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"}, - { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, "SECG curve over a 113 bit binary field"}, - { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, "SECG/WTLS curve over a 131 bit binary field"}, - { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, "SECG curve over a 131 bit binary field"}, - { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field" }, - { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, "SECG curve over a 163 bit binary field"}, - { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, "NIST/SECG curve over a 163 bit binary field" }, - { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, "SECG curve over a 193 bit binary field"}, - { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, "SECG curve over a 193 bit binary field"}, - { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field" }, - { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field" }, - { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, "SECG curve over a 239 bit binary field"}, - { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, "NIST/SECG curve over a 283 bit binary field" }, - { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, "NIST/SECG curve over a 283 bit binary field" }, - { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, "NIST/SECG curve over a 409 bit binary field" }, - { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, "NIST/SECG curve over a 409 bit binary field" }, - { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, "NIST/SECG curve over a 571 bit binary field" }, - { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, "NIST/SECG curve over a 571 bit binary field" }, + { NID_sect113r1, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" }, + { NID_sect113r2, &_EC_SECG_CHAR2_113R2.h, 0, "SECG curve over a 113 bit binary field" }, + { NID_sect131r1, &_EC_SECG_CHAR2_131R1.h, 0, "SECG/WTLS curve over a 131 bit binary field" }, + { NID_sect131r2, &_EC_SECG_CHAR2_131R2.h, 0, "SECG curve over a 131 bit binary field" }, + { NID_sect163k1, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" }, + { NID_sect163r1, &_EC_SECG_CHAR2_163R1.h, 0, "SECG curve over a 163 bit binary field" }, + { NID_sect163r2, &_EC_NIST_CHAR2_163B.h, 0, "NIST/SECG curve over a 163 bit binary field" }, + { NID_sect193r1, &_EC_SECG_CHAR2_193R1.h, 0, "SECG curve over a 193 bit binary field" }, + { NID_sect193r2, &_EC_SECG_CHAR2_193R2.h, 0, "SECG curve over a 193 bit binary field" }, + { NID_sect233k1, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, + { NID_sect233r1, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, + { NID_sect239k1, &_EC_SECG_CHAR2_239K1.h, 0, "SECG curve over a 239 bit binary field" }, + { NID_sect283k1, &_EC_NIST_CHAR2_283K.h, 0, "NIST/SECG curve over a 283 bit binary field" }, + { NID_sect283r1, &_EC_NIST_CHAR2_283B.h, 0, "NIST/SECG curve over a 283 bit binary field" }, + { NID_sect409k1, &_EC_NIST_CHAR2_409K.h, 0, "NIST/SECG curve over a 409 bit binary field" }, + { NID_sect409r1, &_EC_NIST_CHAR2_409B.h, 0, "NIST/SECG curve over a 409 bit binary field" }, + { NID_sect571k1, &_EC_NIST_CHAR2_571K.h, 0, "NIST/SECG curve over a 571 bit binary field" }, + { NID_sect571r1, &_EC_NIST_CHAR2_571B.h, 0, "NIST/SECG curve over a 571 bit binary field" }, /* X9.62 curves */ - { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"}, - { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, "X9.62 curve over a 163 bit binary field"}, - { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, "X9.62 curve over a 163 bit binary field"}, - { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, "X9.62 curve over a 176 bit binary field"}, - { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, "X9.62 curve over a 191 bit binary field"}, - { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, "X9.62 curve over a 191 bit binary field"}, - { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, "X9.62 curve over a 191 bit binary field"}, - { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, "X9.62 curve over a 208 bit binary field"}, - { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, "X9.62 curve over a 239 bit binary field"}, - { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, "X9.62 curve over a 239 bit binary field"}, - { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, "X9.62 curve over a 239 bit binary field"}, - { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, "X9.62 curve over a 272 bit binary field"}, - { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, "X9.62 curve over a 304 bit binary field"}, - { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, "X9.62 curve over a 359 bit binary field"}, - { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, "X9.62 curve over a 368 bit binary field"}, - { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, "X9.62 curve over a 431 bit binary field"}, + { NID_X9_62_c2pnb163v1, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" }, + { NID_X9_62_c2pnb163v2, &_EC_X9_62_CHAR2_163V2.h, 0, "X9.62 curve over a 163 bit binary field" }, + { NID_X9_62_c2pnb163v3, &_EC_X9_62_CHAR2_163V3.h, 0, "X9.62 curve over a 163 bit binary field" }, + { NID_X9_62_c2pnb176v1, &_EC_X9_62_CHAR2_176V1.h, 0, "X9.62 curve over a 176 bit binary field" }, + { NID_X9_62_c2tnb191v1, &_EC_X9_62_CHAR2_191V1.h, 0, "X9.62 curve over a 191 bit binary field" }, + { NID_X9_62_c2tnb191v2, &_EC_X9_62_CHAR2_191V2.h, 0, "X9.62 curve over a 191 bit binary field" }, + { NID_X9_62_c2tnb191v3, &_EC_X9_62_CHAR2_191V3.h, 0, "X9.62 curve over a 191 bit binary field" }, + { NID_X9_62_c2pnb208w1, &_EC_X9_62_CHAR2_208W1.h, 0, "X9.62 curve over a 208 bit binary field" }, + { NID_X9_62_c2tnb239v1, &_EC_X9_62_CHAR2_239V1.h, 0, "X9.62 curve over a 239 bit binary field" }, + { NID_X9_62_c2tnb239v2, &_EC_X9_62_CHAR2_239V2.h, 0, "X9.62 curve over a 239 bit binary field" }, + { NID_X9_62_c2tnb239v3, &_EC_X9_62_CHAR2_239V3.h, 0, "X9.62 curve over a 239 bit binary field" }, + { NID_X9_62_c2pnb272w1, &_EC_X9_62_CHAR2_272W1.h, 0, "X9.62 curve over a 272 bit binary field" }, + { NID_X9_62_c2pnb304w1, &_EC_X9_62_CHAR2_304W1.h, 0, "X9.62 curve over a 304 bit binary field" }, + { NID_X9_62_c2tnb359v1, &_EC_X9_62_CHAR2_359V1.h, 0, "X9.62 curve over a 359 bit binary field" }, + { NID_X9_62_c2pnb368w1, &_EC_X9_62_CHAR2_368W1.h, 0, "X9.62 curve over a 368 bit binary field" }, + { NID_X9_62_c2tnb431r1, &_EC_X9_62_CHAR2_431R1.h, 0, "X9.62 curve over a 431 bit binary field" }, /* the WAP/WTLS curves * [unlike SECG, spec has its own OIDs for curves from X9.62] */ - { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, "WTLS curve over a 113 bit binary field"}, - { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, "NIST/SECG/WTLS curve over a 163 bit binary field"}, - { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, "SECG curve over a 113 bit binary field"}, - { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, "X9.62 curve over a 163 bit binary field"}, - { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, "SECG/WTLS curve over a 112 bit prime field"}, - { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, "SECG/WTLS curve over a 160 bit prime field"}, - { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, "WTLS curve over a 112 bit prime field"}, - { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, "WTLS curve over a 160 bit prime field" }, - { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, "NIST/SECG/WTLS curve over a 233 bit binary field"}, - { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, "NIST/SECG/WTLS curve over a 233 bit binary field"}, - { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, "WTLS curvs over a 224 bit prime field"}, + { NID_wap_wsg_idm_ecid_wtls1, &_EC_WTLS_1.h, 0, "WTLS curve over a 113 bit binary field" }, + { NID_wap_wsg_idm_ecid_wtls3, &_EC_NIST_CHAR2_163K.h, 0, "NIST/SECG/WTLS curve over a 163 bit binary field" }, + { NID_wap_wsg_idm_ecid_wtls4, &_EC_SECG_CHAR2_113R1.h, 0, "SECG curve over a 113 bit binary field" }, + { NID_wap_wsg_idm_ecid_wtls5, &_EC_X9_62_CHAR2_163V1.h, 0, "X9.62 curve over a 163 bit binary field" }, +#endif + { NID_wap_wsg_idm_ecid_wtls6, &_EC_SECG_PRIME_112R1.h, 0, "SECG/WTLS curve over a 112 bit prime field" }, + { NID_wap_wsg_idm_ecid_wtls7, &_EC_SECG_PRIME_160R2.h, 0, "SECG/WTLS curve over a 160 bit prime field" }, + { NID_wap_wsg_idm_ecid_wtls8, &_EC_WTLS_8.h, 0, "WTLS curve over a 112 bit prime field" }, + { NID_wap_wsg_idm_ecid_wtls9, &_EC_WTLS_9.h, 0, "WTLS curve over a 160 bit prime field" }, +#ifndef OPENSSL_NO_EC2M + { NID_wap_wsg_idm_ecid_wtls10, &_EC_NIST_CHAR2_233K.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, + { NID_wap_wsg_idm_ecid_wtls11, &_EC_NIST_CHAR2_233B.h, 0, "NIST/SECG/WTLS curve over a 233 bit binary field" }, +#endif + { NID_wap_wsg_idm_ecid_wtls12, &_EC_WTLS_12.h, 0, "WTLS curvs over a 224 bit prime field" }, +#ifndef OPENSSL_NO_EC2M /* IPSec curves */ - { NID_ipsec3, &_EC_IPSEC_155_ID3.h, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"}, - { NID_ipsec4, &_EC_IPSEC_185_ID4.h, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n""\tNot suitable for ECDSA.\n\tQuestionable extension field!"}, + { NID_ipsec3, &_EC_IPSEC_155_ID3.h, 0, "\n\tIPSec/IKE/Oakley curve #3 over a 155 bit binary field.\n" + "\tNot suitable for ECDSA.\n\tQuestionable extension field!" }, + { NID_ipsec4, &_EC_IPSEC_185_ID4.h, 0, "\n\tIPSec/IKE/Oakley curve #4 over a 185 bit binary field.\n" + "\tNot suitable for ECDSA.\n\tQuestionable extension field!" }, +#endif }; #define curve_list_length (sizeof(curve_list)/sizeof(ec_list_element)) -static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) +static EC_GROUP *ec_group_new_from_data(const ec_list_element curve) { EC_GROUP *group=NULL; EC_POINT *P=NULL; BN_CTX *ctx=NULL; - BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL; + BIGNUM *p=NULL, *a=NULL, *b=NULL, *x=NULL, *y=NULL, *order=NULL; int ok=0; int seed_len,param_len; + const EC_METHOD *meth; + const EC_CURVE_DATA *data; const unsigned char *params; if ((ctx = BN_CTX_new()) == NULL) @@ -1922,10 +1950,11 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) goto err; } + data = curve.data; seed_len = data->seed_len; param_len = data->param_len; - params = (const unsigned char *)(data+1); /* skip header */ - params += seed_len; /* skip seed */ + params = (const unsigned char *)(data+1); /* skip header */ + params += seed_len; /* skip seed */ if (!(p = BN_bin2bn(params+0*param_len, param_len, NULL)) || !(a = BN_bin2bn(params+1*param_len, param_len, NULL)) @@ -1935,7 +1964,17 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) goto err; } - if (data->field_type == NID_X9_62_prime_field) + if (curve.meth != 0) + { + meth = curve.meth(); + if (((group = EC_GROUP_new(meth)) == NULL) || + (!(group->meth->group_set_curve(group, p, a, b, ctx)))) + { + ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); + goto err; + } + } + else if (data->field_type == NID_X9_62_prime_field) { if ((group = EC_GROUP_new_curve_GFp(p, a, b, ctx)) == NULL) { @@ -1943,6 +1982,7 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) goto err; } } +#ifndef OPENSSL_NO_EC2M else /* field_type == NID_X9_62_characteristic_two_field */ { if ((group = EC_GROUP_new_curve_GF2m(p, a, b, ctx)) == NULL) @@ -1951,20 +1991,21 @@ static EC_GROUP *ec_group_new_from_data(const EC_CURVE_DATA *data) goto err; } } +#endif if ((P = EC_POINT_new(group)) == NULL) { ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); goto err; } - + if (!(x = BN_bin2bn(params+3*param_len, param_len, NULL)) || !(y = BN_bin2bn(params+4*param_len, param_len, NULL))) { ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_BN_LIB); goto err; } - if (!EC_POINT_set_affine_coordinates_GF2m(group, P, x, y, ctx)) + if (!EC_POINT_set_affine_coordinates_GFp(group, P, x, y, ctx)) { ECerr(EC_F_EC_GROUP_NEW_FROM_DATA, ERR_R_EC_LIB); goto err; @@ -2025,7 +2066,7 @@ EC_GROUP *EC_GROUP_new_by_curve_name(int nid) for (i=0; i + */ + meth = EC_GFp_mont_method(); +#else meth = EC_GFp_nist_method(); +#endif ret = EC_GROUP_new(meth); if (ret == NULL) @@ -122,7 +147,7 @@ EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, const BIGNUM return ret; } - +#ifndef OPENSSL_NO_EC2M EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) { const EC_METHOD *meth; @@ -142,3 +167,4 @@ EC_GROUP *EC_GROUP_new_curve_GF2m(const BIGNUM *p, const BIGNUM *a, const BIGNUM return ret; } +#endif diff --git a/src/lib/libcrypto/ec/ec_err.c b/src/lib/libcrypto/ec/ec_err.c index 84b4833371..0d19398731 100644 --- a/src/lib/libcrypto/ec/ec_err.c +++ b/src/lib/libcrypto/ec/ec_err.c @@ -1,6 +1,6 @@ /* crypto/ec/ec_err.c */ /* ==================================================================== - * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -70,6 +70,7 @@ static ERR_STRING_DATA EC_str_functs[]= { +{ERR_FUNC(EC_F_BN_TO_FELEM), "BN_TO_FELEM"}, {ERR_FUNC(EC_F_COMPUTE_WNAF), "COMPUTE_WNAF"}, {ERR_FUNC(EC_F_D2I_ECPARAMETERS), "d2i_ECParameters"}, {ERR_FUNC(EC_F_D2I_ECPKPARAMETERS), "d2i_ECPKParameters"}, @@ -112,6 +113,15 @@ static ERR_STRING_DATA EC_str_functs[]= {ERR_FUNC(EC_F_EC_GFP_MONT_FIELD_SQR), "ec_GFp_mont_field_sqr"}, {ERR_FUNC(EC_F_EC_GFP_MONT_GROUP_SET_CURVE), "ec_GFp_mont_group_set_curve"}, {ERR_FUNC(EC_F_EC_GFP_MONT_GROUP_SET_CURVE_GFP), "EC_GFP_MONT_GROUP_SET_CURVE_GFP"}, +{ERR_FUNC(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE), "ec_GFp_nistp224_group_set_curve"}, +{ERR_FUNC(EC_F_EC_GFP_NISTP224_POINTS_MUL), "ec_GFp_nistp224_points_mul"}, +{ERR_FUNC(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES), "ec_GFp_nistp224_point_get_affine_coordinates"}, +{ERR_FUNC(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE), "ec_GFp_nistp256_group_set_curve"}, +{ERR_FUNC(EC_F_EC_GFP_NISTP256_POINTS_MUL), "ec_GFp_nistp256_points_mul"}, +{ERR_FUNC(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES), "ec_GFp_nistp256_point_get_affine_coordinates"}, +{ERR_FUNC(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE), "ec_GFp_nistp521_group_set_curve"}, +{ERR_FUNC(EC_F_EC_GFP_NISTP521_POINTS_MUL), "ec_GFp_nistp521_points_mul"}, +{ERR_FUNC(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES), "ec_GFp_nistp521_point_get_affine_coordinates"}, {ERR_FUNC(EC_F_EC_GFP_NIST_FIELD_MUL), "ec_GFp_nist_field_mul"}, {ERR_FUNC(EC_F_EC_GFP_NIST_FIELD_SQR), "ec_GFp_nist_field_sqr"}, {ERR_FUNC(EC_F_EC_GFP_NIST_GROUP_SET_CURVE), "ec_GFp_nist_group_set_curve"}, @@ -154,6 +164,7 @@ static ERR_STRING_DATA EC_str_functs[]= {ERR_FUNC(EC_F_EC_KEY_NEW), "EC_KEY_new"}, {ERR_FUNC(EC_F_EC_KEY_PRINT), "EC_KEY_print"}, {ERR_FUNC(EC_F_EC_KEY_PRINT_FP), "EC_KEY_print_fp"}, +{ERR_FUNC(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES), "EC_KEY_set_public_key_affine_coordinates"}, {ERR_FUNC(EC_F_EC_POINTS_MAKE_AFFINE), "EC_POINTs_make_affine"}, {ERR_FUNC(EC_F_EC_POINT_ADD), "EC_POINT_add"}, {ERR_FUNC(EC_F_EC_POINT_CMP), "EC_POINT_cmp"}, @@ -184,6 +195,9 @@ static ERR_STRING_DATA EC_str_functs[]= {ERR_FUNC(EC_F_I2D_ECPKPARAMETERS), "i2d_ECPKParameters"}, {ERR_FUNC(EC_F_I2D_ECPRIVATEKEY), "i2d_ECPrivateKey"}, {ERR_FUNC(EC_F_I2O_ECPUBLICKEY), "i2o_ECPublicKey"}, +{ERR_FUNC(EC_F_NISTP224_PRE_COMP_NEW), "NISTP224_PRE_COMP_NEW"}, +{ERR_FUNC(EC_F_NISTP256_PRE_COMP_NEW), "NISTP256_PRE_COMP_NEW"}, +{ERR_FUNC(EC_F_NISTP521_PRE_COMP_NEW), "NISTP521_PRE_COMP_NEW"}, {ERR_FUNC(EC_F_O2I_ECPUBLICKEY), "o2i_ECPublicKey"}, {ERR_FUNC(EC_F_OLD_EC_PRIV_DECODE), "OLD_EC_PRIV_DECODE"}, {ERR_FUNC(EC_F_PKEY_EC_CTRL), "PKEY_EC_CTRL"}, @@ -199,12 +213,15 @@ static ERR_STRING_DATA EC_str_reasons[]= { {ERR_REASON(EC_R_ASN1_ERROR) ,"asn1 error"}, {ERR_REASON(EC_R_ASN1_UNKNOWN_FIELD) ,"asn1 unknown field"}, +{ERR_REASON(EC_R_BIGNUM_OUT_OF_RANGE) ,"bignum out of range"}, {ERR_REASON(EC_R_BUFFER_TOO_SMALL) ,"buffer too small"}, +{ERR_REASON(EC_R_COORDINATES_OUT_OF_RANGE),"coordinates out of range"}, {ERR_REASON(EC_R_D2I_ECPKPARAMETERS_FAILURE),"d2i ecpkparameters failure"}, {ERR_REASON(EC_R_DECODE_ERROR) ,"decode error"}, {ERR_REASON(EC_R_DISCRIMINANT_IS_ZERO) ,"discriminant is zero"}, {ERR_REASON(EC_R_EC_GROUP_NEW_BY_NAME_FAILURE),"ec group new by name failure"}, {ERR_REASON(EC_R_FIELD_TOO_LARGE) ,"field too large"}, +{ERR_REASON(EC_R_GF2M_NOT_SUPPORTED) ,"gf2m not supported"}, {ERR_REASON(EC_R_GROUP2PKPARAMETERS_FAILURE),"group2pkparameters failure"}, {ERR_REASON(EC_R_I2D_ECPKPARAMETERS_FAILURE),"i2d ecpkparameters failure"}, {ERR_REASON(EC_R_INCOMPATIBLE_OBJECTS) ,"incompatible objects"}, @@ -239,6 +256,7 @@ static ERR_STRING_DATA EC_str_reasons[]= {ERR_REASON(EC_R_UNKNOWN_GROUP) ,"unknown group"}, {ERR_REASON(EC_R_UNKNOWN_ORDER) ,"unknown order"}, {ERR_REASON(EC_R_UNSUPPORTED_FIELD) ,"unsupported field"}, +{ERR_REASON(EC_R_WRONG_CURVE_PARAMETERS) ,"wrong curve parameters"}, {ERR_REASON(EC_R_WRONG_ORDER) ,"wrong order"}, {0,NULL} }; diff --git a/src/lib/libcrypto/ec/ec_key.c b/src/lib/libcrypto/ec/ec_key.c index 522802c07a..bf9fd2dc2c 100644 --- a/src/lib/libcrypto/ec/ec_key.c +++ b/src/lib/libcrypto/ec/ec_key.c @@ -64,7 +64,9 @@ #include #include "ec_lcl.h" #include -#include +#ifdef OPENSSL_FIPS +#include +#endif EC_KEY *EC_KEY_new(void) { @@ -78,6 +80,7 @@ EC_KEY *EC_KEY_new(void) } ret->version = 1; + ret->flags = 0; ret->group = NULL; ret->pub_key = NULL; ret->priv_key= NULL; @@ -197,6 +200,7 @@ EC_KEY *EC_KEY_copy(EC_KEY *dest, const EC_KEY *src) dest->enc_flag = src->enc_flag; dest->conv_form = src->conv_form; dest->version = src->version; + dest->flags = src->flags; return dest; } @@ -237,6 +241,11 @@ int EC_KEY_generate_key(EC_KEY *eckey) BIGNUM *priv_key = NULL, *order = NULL; EC_POINT *pub_key = NULL; +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + return FIPS_ec_key_generate_key(eckey); +#endif + if (!eckey || !eckey->group) { ECerr(EC_F_EC_KEY_GENERATE_KEY, ERR_R_PASSED_NULL_PARAMETER); @@ -371,6 +380,82 @@ err: return(ok); } +int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, BIGNUM *x, BIGNUM *y) + { + BN_CTX *ctx = NULL; + BIGNUM *tx, *ty; + EC_POINT *point = NULL; + int ok = 0, tmp_nid, is_char_two = 0; + + if (!key || !key->group || !x || !y) + { + ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES, + ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + ctx = BN_CTX_new(); + if (!ctx) + goto err; + + point = EC_POINT_new(key->group); + + if (!point) + goto err; + + tmp_nid = EC_METHOD_get_field_type(EC_GROUP_method_of(key->group)); + + if (tmp_nid == NID_X9_62_characteristic_two_field) + is_char_two = 1; + + tx = BN_CTX_get(ctx); + ty = BN_CTX_get(ctx); +#ifndef OPENSSL_NO_EC2M + if (is_char_two) + { + if (!EC_POINT_set_affine_coordinates_GF2m(key->group, point, + x, y, ctx)) + goto err; + if (!EC_POINT_get_affine_coordinates_GF2m(key->group, point, + tx, ty, ctx)) + goto err; + } + else +#endif + { + if (!EC_POINT_set_affine_coordinates_GFp(key->group, point, + x, y, ctx)) + goto err; + if (!EC_POINT_get_affine_coordinates_GFp(key->group, point, + tx, ty, ctx)) + goto err; + } + /* Check if retrieved coordinates match originals: if not values + * are out of range. + */ + if (BN_cmp(x, tx) || BN_cmp(y, ty)) + { + ECerr(EC_F_EC_KEY_SET_PUBLIC_KEY_AFFINE_COORDINATES, + EC_R_COORDINATES_OUT_OF_RANGE); + goto err; + } + + if (!EC_KEY_set_public_key(key, point)) + goto err; + + if (EC_KEY_check_key(key) == 0) + goto err; + + ok = 1; + + err: + if (ctx) + BN_CTX_free(ctx); + if (point) + EC_POINT_free(point); + return ok; + + } + const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key) { return key->group; @@ -461,3 +546,18 @@ int EC_KEY_precompute_mult(EC_KEY *key, BN_CTX *ctx) return 0; return EC_GROUP_precompute_mult(key->group, ctx); } + +int EC_KEY_get_flags(const EC_KEY *key) + { + return key->flags; + } + +void EC_KEY_set_flags(EC_KEY *key, int flags) + { + key->flags |= flags; + } + +void EC_KEY_clear_flags(EC_KEY *key, int flags) + { + key->flags &= ~flags; + } diff --git a/src/lib/libcrypto/ec/ec_lcl.h b/src/lib/libcrypto/ec/ec_lcl.h index 3e2c34b0bc..da7967df38 100644 --- a/src/lib/libcrypto/ec/ec_lcl.h +++ b/src/lib/libcrypto/ec/ec_lcl.h @@ -3,7 +3,7 @@ * Originally written by Bodo Moeller for the OpenSSL project. */ /* ==================================================================== - * Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved. + * Copyright (c) 1998-2010 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -82,10 +82,15 @@ # endif #endif +/* Use default functions for poin2oct, oct2point and compressed coordinates */ +#define EC_FLAGS_DEFAULT_OCT 0x1 + /* Structure details are not part of the exported interface, * so all this may change in future versions. */ struct ec_method_st { + /* Various method flags */ + int flags; /* used by EC_METHOD_get_field_type: */ int field_type; /* a NID */ @@ -244,6 +249,7 @@ struct ec_key_st { point_conversion_form_t conv_form; int references; + int flags; EC_EXTRA_DATA *method_data; } /* EC_KEY */; @@ -391,3 +397,50 @@ int ec_GF2m_simple_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *); int ec_GF2m_precompute_mult(EC_GROUP *group, BN_CTX *ctx); int ec_GF2m_have_precompute_mult(const EC_GROUP *group); + +/* method functions in ec2_mult.c */ +int ec_GF2m_simple_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, + size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *); +int ec_GF2m_precompute_mult(EC_GROUP *group, BN_CTX *ctx); +int ec_GF2m_have_precompute_mult(const EC_GROUP *group); + +#ifndef OPENSSL_EC_NISTP_64_GCC_128 +/* method functions in ecp_nistp224.c */ +int ec_GFp_nistp224_group_init(EC_GROUP *group); +int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *); +int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx); +int ec_GFp_nistp224_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *); +int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx); +int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx); +int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group); + +/* method functions in ecp_nistp256.c */ +int ec_GFp_nistp256_group_init(EC_GROUP *group); +int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *); +int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx); +int ec_GFp_nistp256_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *); +int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx); +int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx); +int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group); + +/* method functions in ecp_nistp521.c */ +int ec_GFp_nistp521_group_init(EC_GROUP *group); +int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *n, BN_CTX *); +int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx); +int ec_GFp_nistp521_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *); +int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *scalar, size_t num, const EC_POINT *points[], const BIGNUM *scalars[], BN_CTX *ctx); +int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx); +int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group); + +/* utility functions in ecp_nistputil.c */ +void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array, + size_t felem_size, void *tmp_felems, + void (*felem_one)(void *out), + int (*felem_is_zero)(const void *in), + void (*felem_assign)(void *out, const void *in), + void (*felem_square)(void *out, const void *in), + void (*felem_mul)(void *out, const void *in1, const void *in2), + void (*felem_inv)(void *out, const void *in), + void (*felem_contract)(void *out, const void *in)); +void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign, unsigned char *digit, unsigned char in); +#endif diff --git a/src/lib/libcrypto/ec/ec_lib.c b/src/lib/libcrypto/ec/ec_lib.c index dd7da0fcf9..25247b5803 100644 --- a/src/lib/libcrypto/ec/ec_lib.c +++ b/src/lib/libcrypto/ec/ec_lib.c @@ -425,7 +425,7 @@ int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM * return group->meth->group_get_curve(group, p, a, b, ctx); } - +#ifndef OPENSSL_NO_EC2M int EC_GROUP_set_curve_GF2m(EC_GROUP *group, const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) { if (group->meth->group_set_curve == 0) @@ -446,7 +446,7 @@ int EC_GROUP_get_curve_GF2m(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, BIGNUM } return group->meth->group_get_curve(group, p, a, b, ctx); } - +#endif int EC_GROUP_get_degree(const EC_GROUP *group) { @@ -856,7 +856,7 @@ int EC_POINT_set_affine_coordinates_GFp(const EC_GROUP *group, EC_POINT *point, return group->meth->point_set_affine_coordinates(group, point, x, y, ctx); } - +#ifndef OPENSSL_NO_EC2M int EC_POINT_set_affine_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point, const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx) { @@ -872,7 +872,7 @@ int EC_POINT_set_affine_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point, } return group->meth->point_set_affine_coordinates(group, point, x, y, ctx); } - +#endif int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx) @@ -890,7 +890,7 @@ int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, const EC_POINT *p return group->meth->point_get_affine_coordinates(group, point, x, y, ctx); } - +#ifndef OPENSSL_NO_EC2M int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group, const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx) { @@ -906,75 +906,7 @@ int EC_POINT_get_affine_coordinates_GF2m(const EC_GROUP *group, const EC_POINT * } return group->meth->point_get_affine_coordinates(group, point, x, y, ctx); } - - -int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point, - const BIGNUM *x, int y_bit, BN_CTX *ctx) - { - if (group->meth->point_set_compressed_coordinates == 0) - { - ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - if (group->meth != point->meth) - { - ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx); - } - - -int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point, - const BIGNUM *x, int y_bit, BN_CTX *ctx) - { - if (group->meth->point_set_compressed_coordinates == 0) - { - ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - if (group->meth != point->meth) - { - ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx); - } - - -size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, - unsigned char *buf, size_t len, BN_CTX *ctx) - { - if (group->meth->point2oct == 0) - { - ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - if (group->meth != point->meth) - { - ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - return group->meth->point2oct(group, point, form, buf, len, ctx); - } - - -int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point, - const unsigned char *buf, size_t len, BN_CTX *ctx) - { - if (group->meth->oct2point == 0) - { - ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - if (group->meth != point->meth) - { - ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - return group->meth->oct2point(group, point, buf, len, ctx); - } - +#endif int EC_POINT_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx) { diff --git a/src/lib/libcrypto/ec/ec_oct.c b/src/lib/libcrypto/ec/ec_oct.c new file mode 100644 index 0000000000..fd9db0798d --- /dev/null +++ b/src/lib/libcrypto/ec/ec_oct.c @@ -0,0 +1,199 @@ +/* crypto/ec/ec_lib.c */ +/* + * Originally written by Bodo Moeller for the OpenSSL project. + */ +/* ==================================================================== + * Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +/* ==================================================================== + * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. + * Binary polynomial ECC support in OpenSSL originally developed by + * SUN MICROSYSTEMS, INC., and contributed to the OpenSSL project. + */ + +#include + +#include +#include + +#include "ec_lcl.h" + +int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, EC_POINT *point, + const BIGNUM *x, int y_bit, BN_CTX *ctx) + { + if (group->meth->point_set_compressed_coordinates == 0 + && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) + { + ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + if (group->meth != point->meth) + { + ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) + { + if (group->meth->field_type == NID_X9_62_prime_field) + return ec_GFp_simple_set_compressed_coordinates( + group, point, x, y_bit, ctx); + else +#ifdef OPENSSL_NO_EC2M + { + ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GFP, EC_R_GF2M_NOT_SUPPORTED); + return 0; + } +#else + return ec_GF2m_simple_set_compressed_coordinates( + group, point, x, y_bit, ctx); +#endif + } + return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx); + } + +#ifndef OPENSSL_NO_EC2M +int EC_POINT_set_compressed_coordinates_GF2m(const EC_GROUP *group, EC_POINT *point, + const BIGNUM *x, int y_bit, BN_CTX *ctx) + { + if (group->meth->point_set_compressed_coordinates == 0 + && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) + { + ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + if (group->meth != point->meth) + { + ECerr(EC_F_EC_POINT_SET_COMPRESSED_COORDINATES_GF2M, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) + { + if (group->meth->field_type == NID_X9_62_prime_field) + return ec_GFp_simple_set_compressed_coordinates( + group, point, x, y_bit, ctx); + else + return ec_GF2m_simple_set_compressed_coordinates( + group, point, x, y_bit, ctx); + } + return group->meth->point_set_compressed_coordinates(group, point, x, y_bit, ctx); + } +#endif + +size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, + unsigned char *buf, size_t len, BN_CTX *ctx) + { + if (group->meth->point2oct == 0 + && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) + { + ECerr(EC_F_EC_POINT_POINT2OCT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + if (group->meth != point->meth) + { + ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) + { + if (group->meth->field_type == NID_X9_62_prime_field) + return ec_GFp_simple_point2oct(group, point, + form, buf, len, ctx); + else +#ifdef OPENSSL_NO_EC2M + { + ECerr(EC_F_EC_POINT_POINT2OCT, EC_R_GF2M_NOT_SUPPORTED); + return 0; + } +#else + return ec_GF2m_simple_point2oct(group, point, + form, buf, len, ctx); +#endif + } + + return group->meth->point2oct(group, point, form, buf, len, ctx); + } + + +int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point, + const unsigned char *buf, size_t len, BN_CTX *ctx) + { + if (group->meth->oct2point == 0 + && !(group->meth->flags & EC_FLAGS_DEFAULT_OCT)) + { + ECerr(EC_F_EC_POINT_OCT2POINT, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + if (group->meth != point->meth) + { + ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + if(group->meth->flags & EC_FLAGS_DEFAULT_OCT) + { + if (group->meth->field_type == NID_X9_62_prime_field) + return ec_GFp_simple_oct2point(group, point, + buf, len, ctx); + else +#ifdef OPENSSL_NO_EC2M + { + ECerr(EC_F_EC_POINT_OCT2POINT, EC_R_GF2M_NOT_SUPPORTED); + return 0; + } +#else + return ec_GF2m_simple_oct2point(group, point, + buf, len, ctx); +#endif + } + return group->meth->oct2point(group, point, buf, len, ctx); + } + diff --git a/src/lib/libcrypto/ec/ec_pmeth.c b/src/lib/libcrypto/ec/ec_pmeth.c index f433076ca1..d1ed66c37e 100644 --- a/src/lib/libcrypto/ec/ec_pmeth.c +++ b/src/lib/libcrypto/ec/ec_pmeth.c @@ -221,6 +221,7 @@ static int pkey_ec_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) case EVP_PKEY_CTRL_MD: if (EVP_MD_type((const EVP_MD *)p2) != NID_sha1 && + EVP_MD_type((const EVP_MD *)p2) != NID_ecdsa_with_SHA1 && EVP_MD_type((const EVP_MD *)p2) != NID_sha224 && EVP_MD_type((const EVP_MD *)p2) != NID_sha256 && EVP_MD_type((const EVP_MD *)p2) != NID_sha384 && diff --git a/src/lib/libcrypto/ec/eck_prn.c b/src/lib/libcrypto/ec/eck_prn.c index 7d3e175ae7..06de8f3959 100644 --- a/src/lib/libcrypto/ec/eck_prn.c +++ b/src/lib/libcrypto/ec/eck_prn.c @@ -207,7 +207,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off) reason = ERR_R_MALLOC_FAILURE; goto err; } - +#ifndef OPENSSL_NO_EC2M if (is_char_two) { if (!EC_GROUP_get_curve_GF2m(x, p, a, b, ctx)) @@ -217,6 +217,7 @@ int ECPKParameters_print(BIO *bp, const EC_GROUP *x, int off) } } else /* prime field */ +#endif { if (!EC_GROUP_get_curve_GFp(x, p, a, b, ctx)) { diff --git a/src/lib/libcrypto/ec/ecp_mont.c b/src/lib/libcrypto/ec/ecp_mont.c index 9fc4a466a5..079e47431b 100644 --- a/src/lib/libcrypto/ec/ecp_mont.c +++ b/src/lib/libcrypto/ec/ecp_mont.c @@ -63,12 +63,20 @@ #include +#ifdef OPENSSL_FIPS +#include +#endif + #include "ec_lcl.h" const EC_METHOD *EC_GFp_mont_method(void) { +#ifdef OPENSSL_FIPS + return fips_ec_gfp_mont_method(); +#else static const EC_METHOD ret = { + EC_FLAGS_DEFAULT_OCT, NID_X9_62_prime_field, ec_GFp_mont_group_init, ec_GFp_mont_group_finish, @@ -87,9 +95,7 @@ const EC_METHOD *EC_GFp_mont_method(void) ec_GFp_simple_get_Jprojective_coordinates_GFp, ec_GFp_simple_point_set_affine_coordinates, ec_GFp_simple_point_get_affine_coordinates, - ec_GFp_simple_set_compressed_coordinates, - ec_GFp_simple_point2oct, - ec_GFp_simple_oct2point, + 0,0,0, ec_GFp_simple_add, ec_GFp_simple_dbl, ec_GFp_simple_invert, @@ -108,7 +114,9 @@ const EC_METHOD *EC_GFp_mont_method(void) ec_GFp_mont_field_decode, ec_GFp_mont_field_set_to_one }; + return &ret; +#endif } diff --git a/src/lib/libcrypto/ec/ecp_nist.c b/src/lib/libcrypto/ec/ecp_nist.c index 2a5682ea41..aad2d5f443 100644 --- a/src/lib/libcrypto/ec/ecp_nist.c +++ b/src/lib/libcrypto/ec/ecp_nist.c @@ -67,9 +67,17 @@ #include #include "ec_lcl.h" +#ifdef OPENSSL_FIPS +#include +#endif + const EC_METHOD *EC_GFp_nist_method(void) { +#ifdef OPENSSL_FIPS + return fips_ec_gfp_nist_method(); +#else static const EC_METHOD ret = { + EC_FLAGS_DEFAULT_OCT, NID_X9_62_prime_field, ec_GFp_simple_group_init, ec_GFp_simple_group_finish, @@ -88,9 +96,7 @@ const EC_METHOD *EC_GFp_nist_method(void) ec_GFp_simple_get_Jprojective_coordinates_GFp, ec_GFp_simple_point_set_affine_coordinates, ec_GFp_simple_point_get_affine_coordinates, - ec_GFp_simple_set_compressed_coordinates, - ec_GFp_simple_point2oct, - ec_GFp_simple_oct2point, + 0,0,0, ec_GFp_simple_add, ec_GFp_simple_dbl, ec_GFp_simple_invert, @@ -110,6 +116,7 @@ const EC_METHOD *EC_GFp_nist_method(void) 0 /* field_set_to_one */ }; return &ret; +#endif } int ec_GFp_nist_group_copy(EC_GROUP *dest, const EC_GROUP *src) diff --git a/src/lib/libcrypto/ec/ecp_nistp224.c b/src/lib/libcrypto/ec/ecp_nistp224.c new file mode 100644 index 0000000000..b5ff56c252 --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistp224.c @@ -0,0 +1,1658 @@ +/* crypto/ec/ecp_nistp224.c */ +/* + * Written by Emilia Kasper (Google) for the OpenSSL project. + */ +/* Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * A 64-bit implementation of the NIST P-224 elliptic curve point multiplication + * + * Inspired by Daniel J. Bernstein's public domain nistp224 implementation + * and Adam Langley's public domain 64-bit C implementation of curve25519 + */ + +#include +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 + +#ifndef OPENSSL_SYS_VMS +#include +#else +#include +#endif + +#include +#include +#include "ec_lcl.h" + +#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) + /* even with gcc, the typedef won't work for 32-bit platforms */ + typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */ +#else + #error "Need GCC 3.1 or later to define type uint128_t" +#endif + +typedef uint8_t u8; +typedef uint64_t u64; +typedef int64_t s64; + + +/******************************************************************************/ +/* INTERNAL REPRESENTATION OF FIELD ELEMENTS + * + * Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3 + * using 64-bit coefficients called 'limbs', + * and sometimes (for multiplication results) as + * b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + 2^336*b_6 + * using 128-bit coefficients called 'widelimbs'. + * A 4-limb representation is an 'felem'; + * a 7-widelimb representation is a 'widefelem'. + * Even within felems, bits of adjacent limbs overlap, and we don't always + * reduce the representations: we ensure that inputs to each felem + * multiplication satisfy a_i < 2^60, so outputs satisfy b_i < 4*2^60*2^60, + * and fit into a 128-bit word without overflow. The coefficients are then + * again partially reduced to obtain an felem satisfying a_i < 2^57. + * We only reduce to the unique minimal representation at the end of the + * computation. + */ + +typedef uint64_t limb; +typedef uint128_t widelimb; + +typedef limb felem[4]; +typedef widelimb widefelem[7]; + +/* Field element represented as a byte arrary. + * 28*8 = 224 bits is also the group order size for the elliptic curve, + * and we also use this type for scalars for point multiplication. + */ +typedef u8 felem_bytearray[28]; + +static const felem_bytearray nistp224_curve_params[5] = { + {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, /* p */ + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01}, + {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, /* a */ + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE}, + {0xB4,0x05,0x0A,0x85,0x0C,0x04,0xB3,0xAB,0xF5,0x41, /* b */ + 0x32,0x56,0x50,0x44,0xB0,0xB7,0xD7,0xBF,0xD8,0xBA, + 0x27,0x0B,0x39,0x43,0x23,0x55,0xFF,0xB4}, + {0xB7,0x0E,0x0C,0xBD,0x6B,0xB4,0xBF,0x7F,0x32,0x13, /* x */ + 0x90,0xB9,0x4A,0x03,0xC1,0xD3,0x56,0xC2,0x11,0x22, + 0x34,0x32,0x80,0xD6,0x11,0x5C,0x1D,0x21}, + {0xbd,0x37,0x63,0x88,0xb5,0xf7,0x23,0xfb,0x4c,0x22, /* y */ + 0xdf,0xe6,0xcd,0x43,0x75,0xa0,0x5a,0x07,0x47,0x64, + 0x44,0xd5,0x81,0x99,0x85,0x00,0x7e,0x34} +}; + +/* Precomputed multiples of the standard generator + * Points are given in coordinates (X, Y, Z) where Z normally is 1 + * (0 for the point at infinity). + * For each field element, slice a_0 is word 0, etc. + * + * The table has 2 * 16 elements, starting with the following: + * index | bits | point + * ------+---------+------------------------------ + * 0 | 0 0 0 0 | 0G + * 1 | 0 0 0 1 | 1G + * 2 | 0 0 1 0 | 2^56G + * 3 | 0 0 1 1 | (2^56 + 1)G + * 4 | 0 1 0 0 | 2^112G + * 5 | 0 1 0 1 | (2^112 + 1)G + * 6 | 0 1 1 0 | (2^112 + 2^56)G + * 7 | 0 1 1 1 | (2^112 + 2^56 + 1)G + * 8 | 1 0 0 0 | 2^168G + * 9 | 1 0 0 1 | (2^168 + 1)G + * 10 | 1 0 1 0 | (2^168 + 2^56)G + * 11 | 1 0 1 1 | (2^168 + 2^56 + 1)G + * 12 | 1 1 0 0 | (2^168 + 2^112)G + * 13 | 1 1 0 1 | (2^168 + 2^112 + 1)G + * 14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G + * 15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G + * followed by a copy of this with each element multiplied by 2^28. + * + * The reason for this is so that we can clock bits into four different + * locations when doing simple scalar multiplies against the base point, + * and then another four locations using the second 16 elements. + */ +static const felem gmul[2][16][3] = +{{{{0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}}, + {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf}, + {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723}, + {1, 0, 0, 0}}, + {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5}, + {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321}, + {1, 0, 0, 0}}, + {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748}, + {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17}, + {1, 0, 0, 0}}, + {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe}, + {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b}, + {1, 0, 0, 0}}, + {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3}, + {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a}, + {1, 0, 0, 0}}, + {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c}, + {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244}, + {1, 0, 0, 0}}, + {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849}, + {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112}, + {1, 0, 0, 0}}, + {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47}, + {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394}, + {1, 0, 0, 0}}, + {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d}, + {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7}, + {1, 0, 0, 0}}, + {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24}, + {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881}, + {1, 0, 0, 0}}, + {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984}, + {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369}, + {1, 0, 0, 0}}, + {{0x571e509fb5efb3, 0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3}, + {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60}, + {1, 0, 0, 0}}, + {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057}, + {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9}, + {1, 0, 0, 0}}, + {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9}, + {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc}, + {1, 0, 0, 0}}, + {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58}, + {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558}, + {1, 0, 0, 0}}}, + {{{0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}}, + {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31}, + {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d}, + {1, 0, 0, 0}}, + {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3}, + {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a}, + {1, 0, 0, 0}}, + {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33}, + {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100}, + {1, 0, 0, 0}}, + {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5}, + {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea}, + {1, 0, 0, 0}}, + {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be}, + {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51}, + {1, 0, 0, 0}}, + {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1}, + {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb}, + {1, 0, 0, 0}}, + {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233}, + {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def}, + {1, 0, 0, 0}}, + {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae}, + {0xcf639ea5f98390, 0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45}, + {1, 0, 0, 0}}, + {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e}, + {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb}, + {1, 0, 0, 0}}, + {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de}, + {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3}, + {1, 0, 0, 0}}, + {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05}, + {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58}, + {1, 0, 0, 0}}, + {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb}, + {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0}, + {1, 0, 0, 0}}, + {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9}, + {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea}, + {1, 0, 0, 0}}, + {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba}, + {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405}, + {1, 0, 0, 0}}, + {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e}, + {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e}, + {1, 0, 0, 0}}}}; + +/* Precomputation for the group generator. */ +typedef struct { + felem g_pre_comp[2][16][3]; + int references; +} NISTP224_PRE_COMP; + +const EC_METHOD *EC_GFp_nistp224_method(void) + { + static const EC_METHOD ret = { + EC_FLAGS_DEFAULT_OCT, + NID_X9_62_prime_field, + ec_GFp_nistp224_group_init, + ec_GFp_simple_group_finish, + ec_GFp_simple_group_clear_finish, + ec_GFp_nist_group_copy, + ec_GFp_nistp224_group_set_curve, + ec_GFp_simple_group_get_curve, + ec_GFp_simple_group_get_degree, + ec_GFp_simple_group_check_discriminant, + ec_GFp_simple_point_init, + ec_GFp_simple_point_finish, + ec_GFp_simple_point_clear_finish, + ec_GFp_simple_point_copy, + ec_GFp_simple_point_set_to_infinity, + ec_GFp_simple_set_Jprojective_coordinates_GFp, + ec_GFp_simple_get_Jprojective_coordinates_GFp, + ec_GFp_simple_point_set_affine_coordinates, + ec_GFp_nistp224_point_get_affine_coordinates, + 0 /* point_set_compressed_coordinates */, + 0 /* point2oct */, + 0 /* oct2point */, + ec_GFp_simple_add, + ec_GFp_simple_dbl, + ec_GFp_simple_invert, + ec_GFp_simple_is_at_infinity, + ec_GFp_simple_is_on_curve, + ec_GFp_simple_cmp, + ec_GFp_simple_make_affine, + ec_GFp_simple_points_make_affine, + ec_GFp_nistp224_points_mul, + ec_GFp_nistp224_precompute_mult, + ec_GFp_nistp224_have_precompute_mult, + ec_GFp_nist_field_mul, + ec_GFp_nist_field_sqr, + 0 /* field_div */, + 0 /* field_encode */, + 0 /* field_decode */, + 0 /* field_set_to_one */ }; + + return &ret; + } + +/* Helper functions to convert field elements to/from internal representation */ +static void bin28_to_felem(felem out, const u8 in[28]) + { + out[0] = *((const uint64_t *)(in)) & 0x00ffffffffffffff; + out[1] = (*((const uint64_t *)(in+7))) & 0x00ffffffffffffff; + out[2] = (*((const uint64_t *)(in+14))) & 0x00ffffffffffffff; + out[3] = (*((const uint64_t *)(in+21))) & 0x00ffffffffffffff; + } + +static void felem_to_bin28(u8 out[28], const felem in) + { + unsigned i; + for (i = 0; i < 7; ++i) + { + out[i] = in[0]>>(8*i); + out[i+7] = in[1]>>(8*i); + out[i+14] = in[2]>>(8*i); + out[i+21] = in[3]>>(8*i); + } + } + +/* To preserve endianness when using BN_bn2bin and BN_bin2bn */ +static void flip_endian(u8 *out, const u8 *in, unsigned len) + { + unsigned i; + for (i = 0; i < len; ++i) + out[i] = in[len-1-i]; + } + +/* From OpenSSL BIGNUM to internal representation */ +static int BN_to_felem(felem out, const BIGNUM *bn) + { + felem_bytearray b_in; + felem_bytearray b_out; + unsigned num_bytes; + + /* BN_bn2bin eats leading zeroes */ + memset(b_out, 0, sizeof b_out); + num_bytes = BN_num_bytes(bn); + if (num_bytes > sizeof b_out) + { + ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); + return 0; + } + if (BN_is_negative(bn)) + { + ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); + return 0; + } + num_bytes = BN_bn2bin(bn, b_in); + flip_endian(b_out, b_in, num_bytes); + bin28_to_felem(out, b_out); + return 1; + } + +/* From internal representation to OpenSSL BIGNUM */ +static BIGNUM *felem_to_BN(BIGNUM *out, const felem in) + { + felem_bytearray b_in, b_out; + felem_to_bin28(b_in, in); + flip_endian(b_out, b_in, sizeof b_out); + return BN_bin2bn(b_out, sizeof b_out, out); + } + +/******************************************************************************/ +/* FIELD OPERATIONS + * + * Field operations, using the internal representation of field elements. + * NB! These operations are specific to our point multiplication and cannot be + * expected to be correct in general - e.g., multiplication with a large scalar + * will cause an overflow. + * + */ + +static void felem_one(felem out) + { + out[0] = 1; + out[1] = 0; + out[2] = 0; + out[3] = 0; + } + +static void felem_assign(felem out, const felem in) + { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + } + +/* Sum two field elements: out += in */ +static void felem_sum(felem out, const felem in) + { + out[0] += in[0]; + out[1] += in[1]; + out[2] += in[2]; + out[3] += in[3]; + } + +/* Get negative value: out = -in */ +/* Assumes in[i] < 2^57 */ +static void felem_neg(felem out, const felem in) + { + static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2); + static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2); + static const limb two58m42m2 = (((limb) 1) << 58) - + (((limb) 1) << 42) - (((limb) 1) << 2); + + /* Set to 0 mod 2^224-2^96+1 to ensure out > in */ + out[0] = two58p2 - in[0]; + out[1] = two58m42m2 - in[1]; + out[2] = two58m2 - in[2]; + out[3] = two58m2 - in[3]; + } + +/* Subtract field elements: out -= in */ +/* Assumes in[i] < 2^57 */ +static void felem_diff(felem out, const felem in) + { + static const limb two58p2 = (((limb) 1) << 58) + (((limb) 1) << 2); + static const limb two58m2 = (((limb) 1) << 58) - (((limb) 1) << 2); + static const limb two58m42m2 = (((limb) 1) << 58) - + (((limb) 1) << 42) - (((limb) 1) << 2); + + /* Add 0 mod 2^224-2^96+1 to ensure out > in */ + out[0] += two58p2; + out[1] += two58m42m2; + out[2] += two58m2; + out[3] += two58m2; + + out[0] -= in[0]; + out[1] -= in[1]; + out[2] -= in[2]; + out[3] -= in[3]; + } + +/* Subtract in unreduced 128-bit mode: out -= in */ +/* Assumes in[i] < 2^119 */ +static void widefelem_diff(widefelem out, const widefelem in) + { + static const widelimb two120 = ((widelimb) 1) << 120; + static const widelimb two120m64 = (((widelimb) 1) << 120) - + (((widelimb) 1) << 64); + static const widelimb two120m104m64 = (((widelimb) 1) << 120) - + (((widelimb) 1) << 104) - (((widelimb) 1) << 64); + + /* Add 0 mod 2^224-2^96+1 to ensure out > in */ + out[0] += two120; + out[1] += two120m64; + out[2] += two120m64; + out[3] += two120; + out[4] += two120m104m64; + out[5] += two120m64; + out[6] += two120m64; + + out[0] -= in[0]; + out[1] -= in[1]; + out[2] -= in[2]; + out[3] -= in[3]; + out[4] -= in[4]; + out[5] -= in[5]; + out[6] -= in[6]; + } + +/* Subtract in mixed mode: out128 -= in64 */ +/* in[i] < 2^63 */ +static void felem_diff_128_64(widefelem out, const felem in) + { + static const widelimb two64p8 = (((widelimb) 1) << 64) + + (((widelimb) 1) << 8); + static const widelimb two64m8 = (((widelimb) 1) << 64) - + (((widelimb) 1) << 8); + static const widelimb two64m48m8 = (((widelimb) 1) << 64) - + (((widelimb) 1) << 48) - (((widelimb) 1) << 8); + + /* Add 0 mod 2^224-2^96+1 to ensure out > in */ + out[0] += two64p8; + out[1] += two64m48m8; + out[2] += two64m8; + out[3] += two64m8; + + out[0] -= in[0]; + out[1] -= in[1]; + out[2] -= in[2]; + out[3] -= in[3]; + } + +/* Multiply a field element by a scalar: out = out * scalar + * The scalars we actually use are small, so results fit without overflow */ +static void felem_scalar(felem out, const limb scalar) + { + out[0] *= scalar; + out[1] *= scalar; + out[2] *= scalar; + out[3] *= scalar; + } + +/* Multiply an unreduced field element by a scalar: out = out * scalar + * The scalars we actually use are small, so results fit without overflow */ +static void widefelem_scalar(widefelem out, const widelimb scalar) + { + out[0] *= scalar; + out[1] *= scalar; + out[2] *= scalar; + out[3] *= scalar; + out[4] *= scalar; + out[5] *= scalar; + out[6] *= scalar; + } + +/* Square a field element: out = in^2 */ +static void felem_square(widefelem out, const felem in) + { + limb tmp0, tmp1, tmp2; + tmp0 = 2 * in[0]; tmp1 = 2 * in[1]; tmp2 = 2 * in[2]; + out[0] = ((widelimb) in[0]) * in[0]; + out[1] = ((widelimb) in[0]) * tmp1; + out[2] = ((widelimb) in[0]) * tmp2 + ((widelimb) in[1]) * in[1]; + out[3] = ((widelimb) in[3]) * tmp0 + + ((widelimb) in[1]) * tmp2; + out[4] = ((widelimb) in[3]) * tmp1 + ((widelimb) in[2]) * in[2]; + out[5] = ((widelimb) in[3]) * tmp2; + out[6] = ((widelimb) in[3]) * in[3]; + } + +/* Multiply two field elements: out = in1 * in2 */ +static void felem_mul(widefelem out, const felem in1, const felem in2) + { + out[0] = ((widelimb) in1[0]) * in2[0]; + out[1] = ((widelimb) in1[0]) * in2[1] + ((widelimb) in1[1]) * in2[0]; + out[2] = ((widelimb) in1[0]) * in2[2] + ((widelimb) in1[1]) * in2[1] + + ((widelimb) in1[2]) * in2[0]; + out[3] = ((widelimb) in1[0]) * in2[3] + ((widelimb) in1[1]) * in2[2] + + ((widelimb) in1[2]) * in2[1] + ((widelimb) in1[3]) * in2[0]; + out[4] = ((widelimb) in1[1]) * in2[3] + ((widelimb) in1[2]) * in2[2] + + ((widelimb) in1[3]) * in2[1]; + out[5] = ((widelimb) in1[2]) * in2[3] + ((widelimb) in1[3]) * in2[2]; + out[6] = ((widelimb) in1[3]) * in2[3]; + } + +/* Reduce seven 128-bit coefficients to four 64-bit coefficients. + * Requires in[i] < 2^126, + * ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 */ +static void felem_reduce(felem out, const widefelem in) + { + static const widelimb two127p15 = (((widelimb) 1) << 127) + + (((widelimb) 1) << 15); + static const widelimb two127m71 = (((widelimb) 1) << 127) - + (((widelimb) 1) << 71); + static const widelimb two127m71m55 = (((widelimb) 1) << 127) - + (((widelimb) 1) << 71) - (((widelimb) 1) << 55); + widelimb output[5]; + + /* Add 0 mod 2^224-2^96+1 to ensure all differences are positive */ + output[0] = in[0] + two127p15; + output[1] = in[1] + two127m71m55; + output[2] = in[2] + two127m71; + output[3] = in[3]; + output[4] = in[4]; + + /* Eliminate in[4], in[5], in[6] */ + output[4] += in[6] >> 16; + output[3] += (in[6] & 0xffff) << 40; + output[2] -= in[6]; + + output[3] += in[5] >> 16; + output[2] += (in[5] & 0xffff) << 40; + output[1] -= in[5]; + + output[2] += output[4] >> 16; + output[1] += (output[4] & 0xffff) << 40; + output[0] -= output[4]; + + /* Carry 2 -> 3 -> 4 */ + output[3] += output[2] >> 56; + output[2] &= 0x00ffffffffffffff; + + output[4] = output[3] >> 56; + output[3] &= 0x00ffffffffffffff; + + /* Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 */ + + /* Eliminate output[4] */ + output[2] += output[4] >> 16; + /* output[2] < 2^56 + 2^56 = 2^57 */ + output[1] += (output[4] & 0xffff) << 40; + output[0] -= output[4]; + + /* Carry 0 -> 1 -> 2 -> 3 */ + output[1] += output[0] >> 56; + out[0] = output[0] & 0x00ffffffffffffff; + + output[2] += output[1] >> 56; + /* output[2] < 2^57 + 2^72 */ + out[1] = output[1] & 0x00ffffffffffffff; + output[3] += output[2] >> 56; + /* output[3] <= 2^56 + 2^16 */ + out[2] = output[2] & 0x00ffffffffffffff; + + /* out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, + * out[3] <= 2^56 + 2^16 (due to final carry), + * so out < 2*p */ + out[3] = output[3]; + } + +static void felem_square_reduce(felem out, const felem in) + { + widefelem tmp; + felem_square(tmp, in); + felem_reduce(out, tmp); + } + +static void felem_mul_reduce(felem out, const felem in1, const felem in2) + { + widefelem tmp; + felem_mul(tmp, in1, in2); + felem_reduce(out, tmp); + } + +/* Reduce to unique minimal representation. + * Requires 0 <= in < 2*p (always call felem_reduce first) */ +static void felem_contract(felem out, const felem in) + { + static const int64_t two56 = ((limb) 1) << 56; + /* 0 <= in < 2*p, p = 2^224 - 2^96 + 1 */ + /* if in > p , reduce in = in - 2^224 + 2^96 - 1 */ + int64_t tmp[4], a; + tmp[0] = in[0]; + tmp[1] = in[1]; + tmp[2] = in[2]; + tmp[3] = in[3]; + /* Case 1: a = 1 iff in >= 2^224 */ + a = (in[3] >> 56); + tmp[0] -= a; + tmp[1] += a << 40; + tmp[3] &= 0x00ffffffffffffff; + /* Case 2: a = 0 iff p <= in < 2^224, i.e., + * the high 128 bits are all 1 and the lower part is non-zero */ + a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) | + (((int64_t)(in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63); + a &= 0x00ffffffffffffff; + /* turn a into an all-one mask (if a = 0) or an all-zero mask */ + a = (a - 1) >> 63; + /* subtract 2^224 - 2^96 + 1 if a is all-one*/ + tmp[3] &= a ^ 0xffffffffffffffff; + tmp[2] &= a ^ 0xffffffffffffffff; + tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff; + tmp[0] -= 1 & a; + + /* eliminate negative coefficients: if tmp[0] is negative, tmp[1] must + * be non-zero, so we only need one step */ + a = tmp[0] >> 63; + tmp[0] += two56 & a; + tmp[1] -= 1 & a; + + /* carry 1 -> 2 -> 3 */ + tmp[2] += tmp[1] >> 56; + tmp[1] &= 0x00ffffffffffffff; + + tmp[3] += tmp[2] >> 56; + tmp[2] &= 0x00ffffffffffffff; + + /* Now 0 <= out < p */ + out[0] = tmp[0]; + out[1] = tmp[1]; + out[2] = tmp[2]; + out[3] = tmp[3]; + } + +/* Zero-check: returns 1 if input is 0, and 0 otherwise. + * We know that field elements are reduced to in < 2^225, + * so we only need to check three cases: 0, 2^224 - 2^96 + 1, + * and 2^225 - 2^97 + 2 */ +static limb felem_is_zero(const felem in) + { + limb zero, two224m96p1, two225m97p2; + + zero = in[0] | in[1] | in[2] | in[3]; + zero = (((int64_t)(zero) - 1) >> 63) & 1; + two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000) + | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x00ffffffffffffff); + two224m96p1 = (((int64_t)(two224m96p1) - 1) >> 63) & 1; + two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000) + | (in[2] ^ 0x00ffffffffffffff) | (in[3] ^ 0x01ffffffffffffff); + two225m97p2 = (((int64_t)(two225m97p2) - 1) >> 63) & 1; + return (zero | two224m96p1 | two225m97p2); + } + +static limb felem_is_zero_int(const felem in) + { + return (int) (felem_is_zero(in) & ((limb)1)); + } + +/* Invert a field element */ +/* Computation chain copied from djb's code */ +static void felem_inv(felem out, const felem in) + { + felem ftmp, ftmp2, ftmp3, ftmp4; + widefelem tmp; + unsigned i; + + felem_square(tmp, in); felem_reduce(ftmp, tmp); /* 2 */ + felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp); /* 2^2 - 1 */ + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^3 - 2 */ + felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp); /* 2^3 - 1 */ + felem_square(tmp, ftmp); felem_reduce(ftmp2, tmp); /* 2^4 - 2 */ + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); /* 2^5 - 4 */ + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); /* 2^6 - 8 */ + felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp, tmp); /* 2^6 - 1 */ + felem_square(tmp, ftmp); felem_reduce(ftmp2, tmp); /* 2^7 - 2 */ + for (i = 0; i < 5; ++i) /* 2^12 - 2^6 */ + { + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); + } + felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp2, tmp); /* 2^12 - 1 */ + felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp); /* 2^13 - 2 */ + for (i = 0; i < 11; ++i) /* 2^24 - 2^12 */ + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp2, tmp); /* 2^24 - 1 */ + felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp); /* 2^25 - 2 */ + for (i = 0; i < 23; ++i) /* 2^48 - 2^24 */ + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^48 - 1 */ + felem_square(tmp, ftmp3); felem_reduce(ftmp4, tmp); /* 2^49 - 2 */ + for (i = 0; i < 47; ++i) /* 2^96 - 2^48 */ + { + felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp); + } + felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp); /* 2^96 - 1 */ + felem_square(tmp, ftmp3); felem_reduce(ftmp4, tmp); /* 2^97 - 2 */ + for (i = 0; i < 23; ++i) /* 2^120 - 2^24 */ + { + felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp); + } + felem_mul(tmp, ftmp2, ftmp4); felem_reduce(ftmp2, tmp); /* 2^120 - 1 */ + for (i = 0; i < 6; ++i) /* 2^126 - 2^6 */ + { + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); + } + felem_mul(tmp, ftmp2, ftmp); felem_reduce(ftmp, tmp); /* 2^126 - 1 */ + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^127 - 2 */ + felem_mul(tmp, ftmp, in); felem_reduce(ftmp, tmp); /* 2^127 - 1 */ + for (i = 0; i < 97; ++i) /* 2^224 - 2^97 */ + { + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); + } + felem_mul(tmp, ftmp, ftmp3); felem_reduce(out, tmp); /* 2^224 - 2^96 - 1 */ + } + +/* Copy in constant time: + * if icopy == 1, copy in to out, + * if icopy == 0, copy out to itself. */ +static void +copy_conditional(felem out, const felem in, limb icopy) + { + unsigned i; + /* icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one */ + const limb copy = -icopy; + for (i = 0; i < 4; ++i) + { + const limb tmp = copy & (in[i] ^ out[i]); + out[i] ^= tmp; + } + } + +/******************************************************************************/ +/* ELLIPTIC CURVE POINT OPERATIONS + * + * Points are represented in Jacobian projective coordinates: + * (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3), + * or to the point at infinity if Z == 0. + * + */ + +/* Double an elliptic curve point: + * (X', Y', Z') = 2 * (X, Y, Z), where + * X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2 + * Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2 + * Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z + * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed, + * while x_out == y_in is not (maybe this works, but it's not tested). */ +static void +point_double(felem x_out, felem y_out, felem z_out, + const felem x_in, const felem y_in, const felem z_in) + { + widefelem tmp, tmp2; + felem delta, gamma, beta, alpha, ftmp, ftmp2; + + felem_assign(ftmp, x_in); + felem_assign(ftmp2, x_in); + + /* delta = z^2 */ + felem_square(tmp, z_in); + felem_reduce(delta, tmp); + + /* gamma = y^2 */ + felem_square(tmp, y_in); + felem_reduce(gamma, tmp); + + /* beta = x*gamma */ + felem_mul(tmp, x_in, gamma); + felem_reduce(beta, tmp); + + /* alpha = 3*(x-delta)*(x+delta) */ + felem_diff(ftmp, delta); + /* ftmp[i] < 2^57 + 2^58 + 2 < 2^59 */ + felem_sum(ftmp2, delta); + /* ftmp2[i] < 2^57 + 2^57 = 2^58 */ + felem_scalar(ftmp2, 3); + /* ftmp2[i] < 3 * 2^58 < 2^60 */ + felem_mul(tmp, ftmp, ftmp2); + /* tmp[i] < 2^60 * 2^59 * 4 = 2^121 */ + felem_reduce(alpha, tmp); + + /* x' = alpha^2 - 8*beta */ + felem_square(tmp, alpha); + /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */ + felem_assign(ftmp, beta); + felem_scalar(ftmp, 8); + /* ftmp[i] < 8 * 2^57 = 2^60 */ + felem_diff_128_64(tmp, ftmp); + /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */ + felem_reduce(x_out, tmp); + + /* z' = (y + z)^2 - gamma - delta */ + felem_sum(delta, gamma); + /* delta[i] < 2^57 + 2^57 = 2^58 */ + felem_assign(ftmp, y_in); + felem_sum(ftmp, z_in); + /* ftmp[i] < 2^57 + 2^57 = 2^58 */ + felem_square(tmp, ftmp); + /* tmp[i] < 4 * 2^58 * 2^58 = 2^118 */ + felem_diff_128_64(tmp, delta); + /* tmp[i] < 2^118 + 2^64 + 8 < 2^119 */ + felem_reduce(z_out, tmp); + + /* y' = alpha*(4*beta - x') - 8*gamma^2 */ + felem_scalar(beta, 4); + /* beta[i] < 4 * 2^57 = 2^59 */ + felem_diff(beta, x_out); + /* beta[i] < 2^59 + 2^58 + 2 < 2^60 */ + felem_mul(tmp, alpha, beta); + /* tmp[i] < 4 * 2^57 * 2^60 = 2^119 */ + felem_square(tmp2, gamma); + /* tmp2[i] < 4 * 2^57 * 2^57 = 2^116 */ + widefelem_scalar(tmp2, 8); + /* tmp2[i] < 8 * 2^116 = 2^119 */ + widefelem_diff(tmp, tmp2); + /* tmp[i] < 2^119 + 2^120 < 2^121 */ + felem_reduce(y_out, tmp); + } + +/* Add two elliptic curve points: + * (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where + * X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 - + * 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 + * Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 - X_3) - + * Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3 + * Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2) + * + * This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0. + */ + +/* This function is not entirely constant-time: + * it includes a branch for checking whether the two input points are equal, + * (while not equal to the point at infinity). + * This case never happens during single point multiplication, + * so there is no timing leak for ECDH or ECDSA signing. */ +static void point_add(felem x3, felem y3, felem z3, + const felem x1, const felem y1, const felem z1, + const int mixed, const felem x2, const felem y2, const felem z2) + { + felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out; + widefelem tmp, tmp2; + limb z1_is_zero, z2_is_zero, x_equal, y_equal; + + if (!mixed) + { + /* ftmp2 = z2^2 */ + felem_square(tmp, z2); + felem_reduce(ftmp2, tmp); + + /* ftmp4 = z2^3 */ + felem_mul(tmp, ftmp2, z2); + felem_reduce(ftmp4, tmp); + + /* ftmp4 = z2^3*y1 */ + felem_mul(tmp2, ftmp4, y1); + felem_reduce(ftmp4, tmp2); + + /* ftmp2 = z2^2*x1 */ + felem_mul(tmp2, ftmp2, x1); + felem_reduce(ftmp2, tmp2); + } + else + { + /* We'll assume z2 = 1 (special case z2 = 0 is handled later) */ + + /* ftmp4 = z2^3*y1 */ + felem_assign(ftmp4, y1); + + /* ftmp2 = z2^2*x1 */ + felem_assign(ftmp2, x1); + } + + /* ftmp = z1^2 */ + felem_square(tmp, z1); + felem_reduce(ftmp, tmp); + + /* ftmp3 = z1^3 */ + felem_mul(tmp, ftmp, z1); + felem_reduce(ftmp3, tmp); + + /* tmp = z1^3*y2 */ + felem_mul(tmp, ftmp3, y2); + /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */ + + /* ftmp3 = z1^3*y2 - z2^3*y1 */ + felem_diff_128_64(tmp, ftmp4); + /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */ + felem_reduce(ftmp3, tmp); + + /* tmp = z1^2*x2 */ + felem_mul(tmp, ftmp, x2); + /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */ + + /* ftmp = z1^2*x2 - z2^2*x1 */ + felem_diff_128_64(tmp, ftmp2); + /* tmp[i] < 2^116 + 2^64 + 8 < 2^117 */ + felem_reduce(ftmp, tmp); + + /* the formulae are incorrect if the points are equal + * so we check for this and do doubling if this happens */ + x_equal = felem_is_zero(ftmp); + y_equal = felem_is_zero(ftmp3); + z1_is_zero = felem_is_zero(z1); + z2_is_zero = felem_is_zero(z2); + /* In affine coordinates, (X_1, Y_1) == (X_2, Y_2) */ + if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) + { + point_double(x3, y3, z3, x1, y1, z1); + return; + } + + /* ftmp5 = z1*z2 */ + if (!mixed) + { + felem_mul(tmp, z1, z2); + felem_reduce(ftmp5, tmp); + } + else + { + /* special case z2 = 0 is handled later */ + felem_assign(ftmp5, z1); + } + + /* z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) */ + felem_mul(tmp, ftmp, ftmp5); + felem_reduce(z_out, tmp); + + /* ftmp = (z1^2*x2 - z2^2*x1)^2 */ + felem_assign(ftmp5, ftmp); + felem_square(tmp, ftmp); + felem_reduce(ftmp, tmp); + + /* ftmp5 = (z1^2*x2 - z2^2*x1)^3 */ + felem_mul(tmp, ftmp, ftmp5); + felem_reduce(ftmp5, tmp); + + /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */ + felem_mul(tmp, ftmp2, ftmp); + felem_reduce(ftmp2, tmp); + + /* tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */ + felem_mul(tmp, ftmp4, ftmp5); + /* tmp[i] < 4 * 2^57 * 2^57 = 2^116 */ + + /* tmp2 = (z1^3*y2 - z2^3*y1)^2 */ + felem_square(tmp2, ftmp3); + /* tmp2[i] < 4 * 2^57 * 2^57 < 2^116 */ + + /* tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 */ + felem_diff_128_64(tmp2, ftmp5); + /* tmp2[i] < 2^116 + 2^64 + 8 < 2^117 */ + + /* ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */ + felem_assign(ftmp5, ftmp2); + felem_scalar(ftmp5, 2); + /* ftmp5[i] < 2 * 2^57 = 2^58 */ + + /* x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 - + 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */ + felem_diff_128_64(tmp2, ftmp5); + /* tmp2[i] < 2^117 + 2^64 + 8 < 2^118 */ + felem_reduce(x_out, tmp2); + + /* ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out */ + felem_diff(ftmp2, x_out); + /* ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 */ + + /* tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) */ + felem_mul(tmp2, ftmp3, ftmp2); + /* tmp2[i] < 4 * 2^57 * 2^59 = 2^118 */ + + /* y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) - + z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */ + widefelem_diff(tmp2, tmp); + /* tmp2[i] < 2^118 + 2^120 < 2^121 */ + felem_reduce(y_out, tmp2); + + /* the result (x_out, y_out, z_out) is incorrect if one of the inputs is + * the point at infinity, so we need to check for this separately */ + + /* if point 1 is at infinity, copy point 2 to output, and vice versa */ + copy_conditional(x_out, x2, z1_is_zero); + copy_conditional(x_out, x1, z2_is_zero); + copy_conditional(y_out, y2, z1_is_zero); + copy_conditional(y_out, y1, z2_is_zero); + copy_conditional(z_out, z2, z1_is_zero); + copy_conditional(z_out, z1, z2_is_zero); + felem_assign(x3, x_out); + felem_assign(y3, y_out); + felem_assign(z3, z_out); + } + +/* select_point selects the |idx|th point from a precomputation table and + * copies it to out. */ +static void select_point(const u64 idx, unsigned int size, const felem pre_comp[/*size*/][3], felem out[3]) + { + unsigned i, j; + limb *outlimbs = &out[0][0]; + memset(outlimbs, 0, 3 * sizeof(felem)); + + for (i = 0; i < size; i++) + { + const limb *inlimbs = &pre_comp[i][0][0]; + u64 mask = i ^ idx; + mask |= mask >> 4; + mask |= mask >> 2; + mask |= mask >> 1; + mask &= 1; + mask--; + for (j = 0; j < 4 * 3; j++) + outlimbs[j] |= inlimbs[j] & mask; + } + } + +/* get_bit returns the |i|th bit in |in| */ +static char get_bit(const felem_bytearray in, unsigned i) + { + if (i >= 224) + return 0; + return (in[i >> 3] >> (i & 7)) & 1; + } + +/* Interleaved point multiplication using precomputed point multiples: + * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], + * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple + * of the generator, using certain (large) precomputed multiples in g_pre_comp. + * Output point (X, Y, Z) is stored in x_out, y_out, z_out */ +static void batch_mul(felem x_out, felem y_out, felem z_out, + const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar, + const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[2][16][3]) + { + int i, skip; + unsigned num; + unsigned gen_mul = (g_scalar != NULL); + felem nq[3], tmp[4]; + u64 bits; + u8 sign, digit; + + /* set nq to the point at infinity */ + memset(nq, 0, 3 * sizeof(felem)); + + /* Loop over all scalars msb-to-lsb, interleaving additions + * of multiples of the generator (two in each of the last 28 rounds) + * and additions of other points multiples (every 5th round). + */ + skip = 1; /* save two point operations in the first round */ + for (i = (num_points ? 220 : 27); i >= 0; --i) + { + /* double */ + if (!skip) + point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); + + /* add multiples of the generator */ + if (gen_mul && (i <= 27)) + { + /* first, look 28 bits upwards */ + bits = get_bit(g_scalar, i + 196) << 3; + bits |= get_bit(g_scalar, i + 140) << 2; + bits |= get_bit(g_scalar, i + 84) << 1; + bits |= get_bit(g_scalar, i + 28); + /* select the point to add, in constant time */ + select_point(bits, 16, g_pre_comp[1], tmp); + + if (!skip) + { + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + 1 /* mixed */, tmp[0], tmp[1], tmp[2]); + } + else + { + memcpy(nq, tmp, 3 * sizeof(felem)); + skip = 0; + } + + /* second, look at the current position */ + bits = get_bit(g_scalar, i + 168) << 3; + bits |= get_bit(g_scalar, i + 112) << 2; + bits |= get_bit(g_scalar, i + 56) << 1; + bits |= get_bit(g_scalar, i); + /* select the point to add, in constant time */ + select_point(bits, 16, g_pre_comp[0], tmp); + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + 1 /* mixed */, tmp[0], tmp[1], tmp[2]); + } + + /* do other additions every 5 doublings */ + if (num_points && (i % 5 == 0)) + { + /* loop over all scalars */ + for (num = 0; num < num_points; ++num) + { + bits = get_bit(scalars[num], i + 4) << 5; + bits |= get_bit(scalars[num], i + 3) << 4; + bits |= get_bit(scalars[num], i + 2) << 3; + bits |= get_bit(scalars[num], i + 1) << 2; + bits |= get_bit(scalars[num], i) << 1; + bits |= get_bit(scalars[num], i - 1); + ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits); + + /* select the point to add or subtract */ + select_point(digit, 17, pre_comp[num], tmp); + felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */ + copy_conditional(tmp[1], tmp[3], sign); + + if (!skip) + { + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + mixed, tmp[0], tmp[1], tmp[2]); + } + else + { + memcpy(nq, tmp, 3 * sizeof(felem)); + skip = 0; + } + } + } + } + felem_assign(x_out, nq[0]); + felem_assign(y_out, nq[1]); + felem_assign(z_out, nq[2]); + } + +/******************************************************************************/ +/* FUNCTIONS TO MANAGE PRECOMPUTATION + */ + +static NISTP224_PRE_COMP *nistp224_pre_comp_new() + { + NISTP224_PRE_COMP *ret = NULL; + ret = (NISTP224_PRE_COMP *) OPENSSL_malloc(sizeof *ret); + if (!ret) + { + ECerr(EC_F_NISTP224_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); + return ret; + } + memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); + ret->references = 1; + return ret; + } + +static void *nistp224_pre_comp_dup(void *src_) + { + NISTP224_PRE_COMP *src = src_; + + /* no need to actually copy, these objects never change! */ + CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); + + return src_; + } + +static void nistp224_pre_comp_free(void *pre_) + { + int i; + NISTP224_PRE_COMP *pre = pre_; + + if (!pre) + return; + + i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); + if (i > 0) + return; + + OPENSSL_free(pre); + } + +static void nistp224_pre_comp_clear_free(void *pre_) + { + int i; + NISTP224_PRE_COMP *pre = pre_; + + if (!pre) + return; + + i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); + if (i > 0) + return; + + OPENSSL_cleanse(pre, sizeof *pre); + OPENSSL_free(pre); + } + +/******************************************************************************/ +/* OPENSSL EC_METHOD FUNCTIONS + */ + +int ec_GFp_nistp224_group_init(EC_GROUP *group) + { + int ret; + ret = ec_GFp_simple_group_init(group); + group->a_is_minus3 = 1; + return ret; + } + +int ec_GFp_nistp224_group_set_curve(EC_GROUP *group, const BIGNUM *p, + const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) + { + int ret = 0; + BN_CTX *new_ctx = NULL; + BIGNUM *curve_p, *curve_a, *curve_b; + + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((curve_p = BN_CTX_get(ctx)) == NULL) || + ((curve_a = BN_CTX_get(ctx)) == NULL) || + ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err; + BN_bin2bn(nistp224_curve_params[0], sizeof(felem_bytearray), curve_p); + BN_bin2bn(nistp224_curve_params[1], sizeof(felem_bytearray), curve_a); + BN_bin2bn(nistp224_curve_params[2], sizeof(felem_bytearray), curve_b); + if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || + (BN_cmp(curve_b, b))) + { + ECerr(EC_F_EC_GFP_NISTP224_GROUP_SET_CURVE, + EC_R_WRONG_CURVE_PARAMETERS); + goto err; + } + group->field_mod_func = BN_nist_mod_224; + ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx); +err: + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + } + +/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns + * (X', Y') = (X/Z^2, Y/Z^3) */ +int ec_GFp_nistp224_point_get_affine_coordinates(const EC_GROUP *group, + const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx) + { + felem z1, z2, x_in, y_in, x_out, y_out; + widefelem tmp; + + if (EC_POINT_is_at_infinity(group, point)) + { + ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES, + EC_R_POINT_AT_INFINITY); + return 0; + } + if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) || + (!BN_to_felem(z1, &point->Z))) return 0; + felem_inv(z2, z1); + felem_square(tmp, z2); felem_reduce(z1, tmp); + felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp); + felem_contract(x_out, x_in); + if (x != NULL) + { + if (!felem_to_BN(x, x_out)) { + ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES, + ERR_R_BN_LIB); + return 0; + } + } + felem_mul(tmp, z1, z2); felem_reduce(z1, tmp); + felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp); + felem_contract(y_out, y_in); + if (y != NULL) + { + if (!felem_to_BN(y, y_out)) { + ECerr(EC_F_EC_GFP_NISTP224_POINT_GET_AFFINE_COORDINATES, + ERR_R_BN_LIB); + return 0; + } + } + return 1; + } + +static void make_points_affine(size_t num, felem points[/*num*/][3], felem tmp_felems[/*num+1*/]) + { + /* Runs in constant time, unless an input is the point at infinity + * (which normally shouldn't happen). */ + ec_GFp_nistp_points_make_affine_internal( + num, + points, + sizeof(felem), + tmp_felems, + (void (*)(void *)) felem_one, + (int (*)(const void *)) felem_is_zero_int, + (void (*)(void *, const void *)) felem_assign, + (void (*)(void *, const void *)) felem_square_reduce, + (void (*)(void *, const void *, const void *)) felem_mul_reduce, + (void (*)(void *, const void *)) felem_inv, + (void (*)(void *, const void *)) felem_contract); + } + +/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values + * Result is stored in r (r can equal one of the inputs). */ +int ec_GFp_nistp224_points_mul(const EC_GROUP *group, EC_POINT *r, + const BIGNUM *scalar, size_t num, const EC_POINT *points[], + const BIGNUM *scalars[], BN_CTX *ctx) + { + int ret = 0; + int j; + unsigned i; + int mixed = 0; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y, *z, *tmp_scalar; + felem_bytearray g_secret; + felem_bytearray *secrets = NULL; + felem (*pre_comp)[17][3] = NULL; + felem *tmp_felems = NULL; + felem_bytearray tmp; + unsigned num_bytes; + int have_pre_comp = 0; + size_t num_points = num; + felem x_in, y_in, z_in, x_out, y_out, z_out; + NISTP224_PRE_COMP *pre = NULL; + const felem (*g_pre_comp)[16][3] = NULL; + EC_POINT *generator = NULL; + const EC_POINT *p = NULL; + const BIGNUM *p_scalar = NULL; + + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((x = BN_CTX_get(ctx)) == NULL) || + ((y = BN_CTX_get(ctx)) == NULL) || + ((z = BN_CTX_get(ctx)) == NULL) || + ((tmp_scalar = BN_CTX_get(ctx)) == NULL)) + goto err; + + if (scalar != NULL) + { + pre = EC_EX_DATA_get_data(group->extra_data, + nistp224_pre_comp_dup, nistp224_pre_comp_free, + nistp224_pre_comp_clear_free); + if (pre) + /* we have precomputation, try to use it */ + g_pre_comp = (const felem (*)[16][3]) pre->g_pre_comp; + else + /* try to use the standard precomputation */ + g_pre_comp = &gmul[0]; + generator = EC_POINT_new(group); + if (generator == NULL) + goto err; + /* get the generator from precomputation */ + if (!felem_to_BN(x, g_pre_comp[0][1][0]) || + !felem_to_BN(y, g_pre_comp[0][1][1]) || + !felem_to_BN(z, g_pre_comp[0][1][2])) + { + ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + if (!EC_POINT_set_Jprojective_coordinates_GFp(group, + generator, x, y, z, ctx)) + goto err; + if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) + /* precomputation matches generator */ + have_pre_comp = 1; + else + /* we don't have valid precomputation: + * treat the generator as a random point */ + num_points = num_points + 1; + } + + if (num_points > 0) + { + if (num_points >= 3) + { + /* unless we precompute multiples for just one or two points, + * converting those into affine form is time well spent */ + mixed = 1; + } + secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray)); + pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem)); + if (mixed) + tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem)); + if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL))) + { + ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_MALLOC_FAILURE); + goto err; + } + + /* we treat NULL scalars as 0, and NULL points as points at infinity, + * i.e., they contribute nothing to the linear combination */ + memset(secrets, 0, num_points * sizeof(felem_bytearray)); + memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem)); + for (i = 0; i < num_points; ++i) + { + if (i == num) + /* the generator */ + { + p = EC_GROUP_get0_generator(group); + p_scalar = scalar; + } + else + /* the i^th point */ + { + p = points[i]; + p_scalar = scalars[i]; + } + if ((p_scalar != NULL) && (p != NULL)) + { + /* reduce scalar to 0 <= scalar < 2^224 */ + if ((BN_num_bits(p_scalar) > 224) || (BN_is_negative(p_scalar))) + { + /* this is an unusual input, and we don't guarantee + * constant-timeness */ + if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) + { + ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + num_bytes = BN_bn2bin(tmp_scalar, tmp); + } + else + num_bytes = BN_bn2bin(p_scalar, tmp); + flip_endian(secrets[i], tmp, num_bytes); + /* precompute multiples */ + if ((!BN_to_felem(x_out, &p->X)) || + (!BN_to_felem(y_out, &p->Y)) || + (!BN_to_felem(z_out, &p->Z))) goto err; + felem_assign(pre_comp[i][1][0], x_out); + felem_assign(pre_comp[i][1][1], y_out); + felem_assign(pre_comp[i][1][2], z_out); + for (j = 2; j <= 16; ++j) + { + if (j & 1) + { + point_add( + pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], + pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], + 0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]); + } + else + { + point_double( + pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], + pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]); + } + } + } + } + if (mixed) + make_points_affine(num_points * 17, pre_comp[0], tmp_felems); + } + + /* the scalar for the generator */ + if ((scalar != NULL) && (have_pre_comp)) + { + memset(g_secret, 0, sizeof g_secret); + /* reduce scalar to 0 <= scalar < 2^224 */ + if ((BN_num_bits(scalar) > 224) || (BN_is_negative(scalar))) + { + /* this is an unusual input, and we don't guarantee + * constant-timeness */ + if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) + { + ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + num_bytes = BN_bn2bin(tmp_scalar, tmp); + } + else + num_bytes = BN_bn2bin(scalar, tmp); + flip_endian(g_secret, tmp, num_bytes); + /* do the multiplication with generator precomputation*/ + batch_mul(x_out, y_out, z_out, + (const felem_bytearray (*)) secrets, num_points, + g_secret, + mixed, (const felem (*)[17][3]) pre_comp, + g_pre_comp); + } + else + /* do the multiplication without generator precomputation */ + batch_mul(x_out, y_out, z_out, + (const felem_bytearray (*)) secrets, num_points, + NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL); + /* reduce the output to its unique minimal representation */ + felem_contract(x_in, x_out); + felem_contract(y_in, y_out); + felem_contract(z_in, z_out); + if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) || + (!felem_to_BN(z, z_in))) + { + ECerr(EC_F_EC_GFP_NISTP224_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx); + +err: + BN_CTX_end(ctx); + if (generator != NULL) + EC_POINT_free(generator); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + if (secrets != NULL) + OPENSSL_free(secrets); + if (pre_comp != NULL) + OPENSSL_free(pre_comp); + if (tmp_felems != NULL) + OPENSSL_free(tmp_felems); + return ret; + } + +int ec_GFp_nistp224_precompute_mult(EC_GROUP *group, BN_CTX *ctx) + { + int ret = 0; + NISTP224_PRE_COMP *pre = NULL; + int i, j; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y; + EC_POINT *generator = NULL; + felem tmp_felems[32]; + + /* throw away old precomputation */ + EC_EX_DATA_free_data(&group->extra_data, nistp224_pre_comp_dup, + nistp224_pre_comp_free, nistp224_pre_comp_clear_free); + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((x = BN_CTX_get(ctx)) == NULL) || + ((y = BN_CTX_get(ctx)) == NULL)) + goto err; + /* get the generator */ + if (group->generator == NULL) goto err; + generator = EC_POINT_new(group); + if (generator == NULL) + goto err; + BN_bin2bn(nistp224_curve_params[3], sizeof (felem_bytearray), x); + BN_bin2bn(nistp224_curve_params[4], sizeof (felem_bytearray), y); + if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx)) + goto err; + if ((pre = nistp224_pre_comp_new()) == NULL) + goto err; + /* if the generator is the standard one, use built-in precomputation */ + if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) + { + memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); + ret = 1; + goto err; + } + if ((!BN_to_felem(pre->g_pre_comp[0][1][0], &group->generator->X)) || + (!BN_to_felem(pre->g_pre_comp[0][1][1], &group->generator->Y)) || + (!BN_to_felem(pre->g_pre_comp[0][1][2], &group->generator->Z))) + goto err; + /* compute 2^56*G, 2^112*G, 2^168*G for the first table, + * 2^28*G, 2^84*G, 2^140*G, 2^196*G for the second one + */ + for (i = 1; i <= 8; i <<= 1) + { + point_double( + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2], + pre->g_pre_comp[0][i][0], pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]); + for (j = 0; j < 27; ++j) + { + point_double( + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2], + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]); + } + if (i == 8) + break; + point_double( + pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2], + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]); + for (j = 0; j < 27; ++j) + { + point_double( + pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2], + pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2]); + } + } + for (i = 0; i < 2; i++) + { + /* g_pre_comp[i][0] is the point at infinity */ + memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0])); + /* the remaining multiples */ + /* 2^56*G + 2^112*G resp. 2^84*G + 2^140*G */ + point_add( + pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1], + pre->g_pre_comp[i][6][2], pre->g_pre_comp[i][4][0], + pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2], + 0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], + pre->g_pre_comp[i][2][2]); + /* 2^56*G + 2^168*G resp. 2^84*G + 2^196*G */ + point_add( + pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1], + pre->g_pre_comp[i][10][2], pre->g_pre_comp[i][8][0], + pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2], + 0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], + pre->g_pre_comp[i][2][2]); + /* 2^112*G + 2^168*G resp. 2^140*G + 2^196*G */ + point_add( + pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], + pre->g_pre_comp[i][12][2], pre->g_pre_comp[i][8][0], + pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2], + 0, pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], + pre->g_pre_comp[i][4][2]); + /* 2^56*G + 2^112*G + 2^168*G resp. 2^84*G + 2^140*G + 2^196*G */ + point_add( + pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1], + pre->g_pre_comp[i][14][2], pre->g_pre_comp[i][12][0], + pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2], + 0, pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], + pre->g_pre_comp[i][2][2]); + for (j = 1; j < 8; ++j) + { + /* odd multiples: add G resp. 2^28*G */ + point_add( + pre->g_pre_comp[i][2*j+1][0], pre->g_pre_comp[i][2*j+1][1], + pre->g_pre_comp[i][2*j+1][2], pre->g_pre_comp[i][2*j][0], + pre->g_pre_comp[i][2*j][1], pre->g_pre_comp[i][2*j][2], + 0, pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1], + pre->g_pre_comp[i][1][2]); + } + } + make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_felems); + + if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp224_pre_comp_dup, + nistp224_pre_comp_free, nistp224_pre_comp_clear_free)) + goto err; + ret = 1; + pre = NULL; + err: + BN_CTX_end(ctx); + if (generator != NULL) + EC_POINT_free(generator); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + if (pre) + nistp224_pre_comp_free(pre); + return ret; + } + +int ec_GFp_nistp224_have_precompute_mult(const EC_GROUP *group) + { + if (EC_EX_DATA_get_data(group->extra_data, nistp224_pre_comp_dup, + nistp224_pre_comp_free, nistp224_pre_comp_clear_free) + != NULL) + return 1; + else + return 0; + } + +#else +static void *dummy=&dummy; +#endif diff --git a/src/lib/libcrypto/ec/ecp_nistp256.c b/src/lib/libcrypto/ec/ecp_nistp256.c new file mode 100644 index 0000000000..4bc0f5dce0 --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistp256.c @@ -0,0 +1,2171 @@ +/* crypto/ec/ecp_nistp256.c */ +/* + * Written by Adam Langley (Google) for the OpenSSL project + */ +/* Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * A 64-bit implementation of the NIST P-256 elliptic curve point multiplication + * + * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c. + * Otherwise based on Emilia's P224 work, which was inspired by my curve25519 + * work which got its smarts from Daniel J. Bernstein's work on the same. + */ + +#include +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 + +#ifndef OPENSSL_SYS_VMS +#include +#else +#include +#endif + +#include +#include +#include "ec_lcl.h" + +#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) + /* even with gcc, the typedef won't work for 32-bit platforms */ + typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */ + typedef __int128_t int128_t; +#else + #error "Need GCC 3.1 or later to define type uint128_t" +#endif + +typedef uint8_t u8; +typedef uint32_t u32; +typedef uint64_t u64; +typedef int64_t s64; + +/* The underlying field. + * + * P256 operates over GF(2^256-2^224+2^192+2^96-1). We can serialise an element + * of this field into 32 bytes. We call this an felem_bytearray. */ + +typedef u8 felem_bytearray[32]; + +/* These are the parameters of P256, taken from FIPS 186-3, page 86. These + * values are big-endian. */ +static const felem_bytearray nistp256_curve_params[5] = { + {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* p */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, /* a = -3 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfc}, /* b */ + {0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7, + 0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc, + 0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6, + 0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b}, + {0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, /* x */ + 0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2, + 0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33, 0xa0, + 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96}, + {0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, /* y */ + 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e, 0x16, + 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce, + 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5} +}; + +/* The representation of field elements. + * ------------------------------------ + * + * We represent field elements with either four 128-bit values, eight 128-bit + * values, or four 64-bit values. The field element represented is: + * v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + v[3]*2^192 (mod p) + * or: + * v[0]*2^0 + v[1]*2^64 + v[2]*2^128 + ... + v[8]*2^512 (mod p) + * + * 128-bit values are called 'limbs'. Since the limbs are spaced only 64 bits + * apart, but are 128-bits wide, the most significant bits of each limb overlap + * with the least significant bits of the next. + * + * A field element with four limbs is an 'felem'. One with eight limbs is a + * 'longfelem' + * + * A field element with four, 64-bit values is called a 'smallfelem'. Small + * values are used as intermediate values before multiplication. + */ + +#define NLIMBS 4 + +typedef uint128_t limb; +typedef limb felem[NLIMBS]; +typedef limb longfelem[NLIMBS * 2]; +typedef u64 smallfelem[NLIMBS]; + +/* This is the value of the prime as four 64-bit words, little-endian. */ +static const u64 kPrime[4] = { 0xfffffffffffffffful, 0xffffffff, 0, 0xffffffff00000001ul }; +static const limb bottom32bits = 0xffffffff; +static const u64 bottom63bits = 0x7ffffffffffffffful; + +/* bin32_to_felem takes a little-endian byte array and converts it into felem + * form. This assumes that the CPU is little-endian. */ +static void bin32_to_felem(felem out, const u8 in[32]) + { + out[0] = *((u64*) &in[0]); + out[1] = *((u64*) &in[8]); + out[2] = *((u64*) &in[16]); + out[3] = *((u64*) &in[24]); + } + +/* smallfelem_to_bin32 takes a smallfelem and serialises into a little endian, + * 32 byte array. This assumes that the CPU is little-endian. */ +static void smallfelem_to_bin32(u8 out[32], const smallfelem in) + { + *((u64*) &out[0]) = in[0]; + *((u64*) &out[8]) = in[1]; + *((u64*) &out[16]) = in[2]; + *((u64*) &out[24]) = in[3]; + } + +/* To preserve endianness when using BN_bn2bin and BN_bin2bn */ +static void flip_endian(u8 *out, const u8 *in, unsigned len) + { + unsigned i; + for (i = 0; i < len; ++i) + out[i] = in[len-1-i]; + } + +/* BN_to_felem converts an OpenSSL BIGNUM into an felem */ +static int BN_to_felem(felem out, const BIGNUM *bn) + { + felem_bytearray b_in; + felem_bytearray b_out; + unsigned num_bytes; + + /* BN_bn2bin eats leading zeroes */ + memset(b_out, 0, sizeof b_out); + num_bytes = BN_num_bytes(bn); + if (num_bytes > sizeof b_out) + { + ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); + return 0; + } + if (BN_is_negative(bn)) + { + ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); + return 0; + } + num_bytes = BN_bn2bin(bn, b_in); + flip_endian(b_out, b_in, num_bytes); + bin32_to_felem(out, b_out); + return 1; + } + +/* felem_to_BN converts an felem into an OpenSSL BIGNUM */ +static BIGNUM *smallfelem_to_BN(BIGNUM *out, const smallfelem in) + { + felem_bytearray b_in, b_out; + smallfelem_to_bin32(b_in, in); + flip_endian(b_out, b_in, sizeof b_out); + return BN_bin2bn(b_out, sizeof b_out, out); + } + + +/* Field operations + * ---------------- */ + +static void smallfelem_one(smallfelem out) + { + out[0] = 1; + out[1] = 0; + out[2] = 0; + out[3] = 0; + } + +static void smallfelem_assign(smallfelem out, const smallfelem in) + { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + } + +static void felem_assign(felem out, const felem in) + { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + } + +/* felem_sum sets out = out + in. */ +static void felem_sum(felem out, const felem in) + { + out[0] += in[0]; + out[1] += in[1]; + out[2] += in[2]; + out[3] += in[3]; + } + +/* felem_small_sum sets out = out + in. */ +static void felem_small_sum(felem out, const smallfelem in) + { + out[0] += in[0]; + out[1] += in[1]; + out[2] += in[2]; + out[3] += in[3]; + } + +/* felem_scalar sets out = out * scalar */ +static void felem_scalar(felem out, const u64 scalar) + { + out[0] *= scalar; + out[1] *= scalar; + out[2] *= scalar; + out[3] *= scalar; + } + +/* longfelem_scalar sets out = out * scalar */ +static void longfelem_scalar(longfelem out, const u64 scalar) + { + out[0] *= scalar; + out[1] *= scalar; + out[2] *= scalar; + out[3] *= scalar; + out[4] *= scalar; + out[5] *= scalar; + out[6] *= scalar; + out[7] *= scalar; + } + +#define two105m41m9 (((limb)1) << 105) - (((limb)1) << 41) - (((limb)1) << 9) +#define two105 (((limb)1) << 105) +#define two105m41p9 (((limb)1) << 105) - (((limb)1) << 41) + (((limb)1) << 9) + +/* zero105 is 0 mod p */ +static const felem zero105 = { two105m41m9, two105, two105m41p9, two105m41p9 }; + +/* smallfelem_neg sets |out| to |-small| + * On exit: + * out[i] < out[i] + 2^105 + */ +static void smallfelem_neg(felem out, const smallfelem small) + { + /* In order to prevent underflow, we subtract from 0 mod p. */ + out[0] = zero105[0] - small[0]; + out[1] = zero105[1] - small[1]; + out[2] = zero105[2] - small[2]; + out[3] = zero105[3] - small[3]; + } + +/* felem_diff subtracts |in| from |out| + * On entry: + * in[i] < 2^104 + * On exit: + * out[i] < out[i] + 2^105 + */ +static void felem_diff(felem out, const felem in) + { + /* In order to prevent underflow, we add 0 mod p before subtracting. */ + out[0] += zero105[0]; + out[1] += zero105[1]; + out[2] += zero105[2]; + out[3] += zero105[3]; + + out[0] -= in[0]; + out[1] -= in[1]; + out[2] -= in[2]; + out[3] -= in[3]; + } + +#define two107m43m11 (((limb)1) << 107) - (((limb)1) << 43) - (((limb)1) << 11) +#define two107 (((limb)1) << 107) +#define two107m43p11 (((limb)1) << 107) - (((limb)1) << 43) + (((limb)1) << 11) + +/* zero107 is 0 mod p */ +static const felem zero107 = { two107m43m11, two107, two107m43p11, two107m43p11 }; + +/* An alternative felem_diff for larger inputs |in| + * felem_diff_zero107 subtracts |in| from |out| + * On entry: + * in[i] < 2^106 + * On exit: + * out[i] < out[i] + 2^107 + */ +static void felem_diff_zero107(felem out, const felem in) + { + /* In order to prevent underflow, we add 0 mod p before subtracting. */ + out[0] += zero107[0]; + out[1] += zero107[1]; + out[2] += zero107[2]; + out[3] += zero107[3]; + + out[0] -= in[0]; + out[1] -= in[1]; + out[2] -= in[2]; + out[3] -= in[3]; + } + +/* longfelem_diff subtracts |in| from |out| + * On entry: + * in[i] < 7*2^67 + * On exit: + * out[i] < out[i] + 2^70 + 2^40 + */ +static void longfelem_diff(longfelem out, const longfelem in) + { + static const limb two70m8p6 = (((limb)1) << 70) - (((limb)1) << 8) + (((limb)1) << 6); + static const limb two70p40 = (((limb)1) << 70) + (((limb)1) << 40); + static const limb two70 = (((limb)1) << 70); + static const limb two70m40m38p6 = (((limb)1) << 70) - (((limb)1) << 40) - (((limb)1) << 38) + (((limb)1) << 6); + static const limb two70m6 = (((limb)1) << 70) - (((limb)1) << 6); + + /* add 0 mod p to avoid underflow */ + out[0] += two70m8p6; + out[1] += two70p40; + out[2] += two70; + out[3] += two70m40m38p6; + out[4] += two70m6; + out[5] += two70m6; + out[6] += two70m6; + out[7] += two70m6; + + /* in[i] < 7*2^67 < 2^70 - 2^40 - 2^38 + 2^6 */ + out[0] -= in[0]; + out[1] -= in[1]; + out[2] -= in[2]; + out[3] -= in[3]; + out[4] -= in[4]; + out[5] -= in[5]; + out[6] -= in[6]; + out[7] -= in[7]; + } + +#define two64m0 (((limb)1) << 64) - 1 +#define two110p32m0 (((limb)1) << 110) + (((limb)1) << 32) - 1 +#define two64m46 (((limb)1) << 64) - (((limb)1) << 46) +#define two64m32 (((limb)1) << 64) - (((limb)1) << 32) + +/* zero110 is 0 mod p */ +static const felem zero110 = { two64m0, two110p32m0, two64m46, two64m32 }; + +/* felem_shrink converts an felem into a smallfelem. The result isn't quite + * minimal as the value may be greater than p. + * + * On entry: + * in[i] < 2^109 + * On exit: + * out[i] < 2^64 + */ +static void felem_shrink(smallfelem out, const felem in) + { + felem tmp; + u64 a, b, mask; + s64 high, low; + static const u64 kPrime3Test = 0x7fffffff00000001ul; /* 2^63 - 2^32 + 1 */ + + /* Carry 2->3 */ + tmp[3] = zero110[3] + in[3] + ((u64) (in[2] >> 64)); + /* tmp[3] < 2^110 */ + + tmp[2] = zero110[2] + (u64) in[2]; + tmp[0] = zero110[0] + in[0]; + tmp[1] = zero110[1] + in[1]; + /* tmp[0] < 2**110, tmp[1] < 2^111, tmp[2] < 2**65 */ + + /* We perform two partial reductions where we eliminate the + * high-word of tmp[3]. We don't update the other words till the end. + */ + a = tmp[3] >> 64; /* a < 2^46 */ + tmp[3] = (u64) tmp[3]; + tmp[3] -= a; + tmp[3] += ((limb)a) << 32; + /* tmp[3] < 2^79 */ + + b = a; + a = tmp[3] >> 64; /* a < 2^15 */ + b += a; /* b < 2^46 + 2^15 < 2^47 */ + tmp[3] = (u64) tmp[3]; + tmp[3] -= a; + tmp[3] += ((limb)a) << 32; + /* tmp[3] < 2^64 + 2^47 */ + + /* This adjusts the other two words to complete the two partial + * reductions. */ + tmp[0] += b; + tmp[1] -= (((limb)b) << 32); + + /* In order to make space in tmp[3] for the carry from 2 -> 3, we + * conditionally subtract kPrime if tmp[3] is large enough. */ + high = tmp[3] >> 64; + /* As tmp[3] < 2^65, high is either 1 or 0 */ + high <<= 63; + high >>= 63; + /* high is: + * all ones if the high word of tmp[3] is 1 + * all zeros if the high word of tmp[3] if 0 */ + low = tmp[3]; + mask = low >> 63; + /* mask is: + * all ones if the MSB of low is 1 + * all zeros if the MSB of low if 0 */ + low &= bottom63bits; + low -= kPrime3Test; + /* if low was greater than kPrime3Test then the MSB is zero */ + low = ~low; + low >>= 63; + /* low is: + * all ones if low was > kPrime3Test + * all zeros if low was <= kPrime3Test */ + mask = (mask & low) | high; + tmp[0] -= mask & kPrime[0]; + tmp[1] -= mask & kPrime[1]; + /* kPrime[2] is zero, so omitted */ + tmp[3] -= mask & kPrime[3]; + /* tmp[3] < 2**64 - 2**32 + 1 */ + + tmp[1] += ((u64) (tmp[0] >> 64)); tmp[0] = (u64) tmp[0]; + tmp[2] += ((u64) (tmp[1] >> 64)); tmp[1] = (u64) tmp[1]; + tmp[3] += ((u64) (tmp[2] >> 64)); tmp[2] = (u64) tmp[2]; + /* tmp[i] < 2^64 */ + + out[0] = tmp[0]; + out[1] = tmp[1]; + out[2] = tmp[2]; + out[3] = tmp[3]; + } + +/* smallfelem_expand converts a smallfelem to an felem */ +static void smallfelem_expand(felem out, const smallfelem in) + { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + } + +/* smallfelem_square sets |out| = |small|^2 + * On entry: + * small[i] < 2^64 + * On exit: + * out[i] < 7 * 2^64 < 2^67 + */ +static void smallfelem_square(longfelem out, const smallfelem small) + { + limb a; + u64 high, low; + + a = ((uint128_t) small[0]) * small[0]; + low = a; + high = a >> 64; + out[0] = low; + out[1] = high; + + a = ((uint128_t) small[0]) * small[1]; + low = a; + high = a >> 64; + out[1] += low; + out[1] += low; + out[2] = high; + + a = ((uint128_t) small[0]) * small[2]; + low = a; + high = a >> 64; + out[2] += low; + out[2] *= 2; + out[3] = high; + + a = ((uint128_t) small[0]) * small[3]; + low = a; + high = a >> 64; + out[3] += low; + out[4] = high; + + a = ((uint128_t) small[1]) * small[2]; + low = a; + high = a >> 64; + out[3] += low; + out[3] *= 2; + out[4] += high; + + a = ((uint128_t) small[1]) * small[1]; + low = a; + high = a >> 64; + out[2] += low; + out[3] += high; + + a = ((uint128_t) small[1]) * small[3]; + low = a; + high = a >> 64; + out[4] += low; + out[4] *= 2; + out[5] = high; + + a = ((uint128_t) small[2]) * small[3]; + low = a; + high = a >> 64; + out[5] += low; + out[5] *= 2; + out[6] = high; + out[6] += high; + + a = ((uint128_t) small[2]) * small[2]; + low = a; + high = a >> 64; + out[4] += low; + out[5] += high; + + a = ((uint128_t) small[3]) * small[3]; + low = a; + high = a >> 64; + out[6] += low; + out[7] = high; + } + +/* felem_square sets |out| = |in|^2 + * On entry: + * in[i] < 2^109 + * On exit: + * out[i] < 7 * 2^64 < 2^67 + */ +static void felem_square(longfelem out, const felem in) + { + u64 small[4]; + felem_shrink(small, in); + smallfelem_square(out, small); + } + +/* smallfelem_mul sets |out| = |small1| * |small2| + * On entry: + * small1[i] < 2^64 + * small2[i] < 2^64 + * On exit: + * out[i] < 7 * 2^64 < 2^67 + */ +static void smallfelem_mul(longfelem out, const smallfelem small1, const smallfelem small2) + { + limb a; + u64 high, low; + + a = ((uint128_t) small1[0]) * small2[0]; + low = a; + high = a >> 64; + out[0] = low; + out[1] = high; + + + a = ((uint128_t) small1[0]) * small2[1]; + low = a; + high = a >> 64; + out[1] += low; + out[2] = high; + + a = ((uint128_t) small1[1]) * small2[0]; + low = a; + high = a >> 64; + out[1] += low; + out[2] += high; + + + a = ((uint128_t) small1[0]) * small2[2]; + low = a; + high = a >> 64; + out[2] += low; + out[3] = high; + + a = ((uint128_t) small1[1]) * small2[1]; + low = a; + high = a >> 64; + out[2] += low; + out[3] += high; + + a = ((uint128_t) small1[2]) * small2[0]; + low = a; + high = a >> 64; + out[2] += low; + out[3] += high; + + + a = ((uint128_t) small1[0]) * small2[3]; + low = a; + high = a >> 64; + out[3] += low; + out[4] = high; + + a = ((uint128_t) small1[1]) * small2[2]; + low = a; + high = a >> 64; + out[3] += low; + out[4] += high; + + a = ((uint128_t) small1[2]) * small2[1]; + low = a; + high = a >> 64; + out[3] += low; + out[4] += high; + + a = ((uint128_t) small1[3]) * small2[0]; + low = a; + high = a >> 64; + out[3] += low; + out[4] += high; + + + a = ((uint128_t) small1[1]) * small2[3]; + low = a; + high = a >> 64; + out[4] += low; + out[5] = high; + + a = ((uint128_t) small1[2]) * small2[2]; + low = a; + high = a >> 64; + out[4] += low; + out[5] += high; + + a = ((uint128_t) small1[3]) * small2[1]; + low = a; + high = a >> 64; + out[4] += low; + out[5] += high; + + + a = ((uint128_t) small1[2]) * small2[3]; + low = a; + high = a >> 64; + out[5] += low; + out[6] = high; + + a = ((uint128_t) small1[3]) * small2[2]; + low = a; + high = a >> 64; + out[5] += low; + out[6] += high; + + + a = ((uint128_t) small1[3]) * small2[3]; + low = a; + high = a >> 64; + out[6] += low; + out[7] = high; + } + +/* felem_mul sets |out| = |in1| * |in2| + * On entry: + * in1[i] < 2^109 + * in2[i] < 2^109 + * On exit: + * out[i] < 7 * 2^64 < 2^67 + */ +static void felem_mul(longfelem out, const felem in1, const felem in2) + { + smallfelem small1, small2; + felem_shrink(small1, in1); + felem_shrink(small2, in2); + smallfelem_mul(out, small1, small2); + } + +/* felem_small_mul sets |out| = |small1| * |in2| + * On entry: + * small1[i] < 2^64 + * in2[i] < 2^109 + * On exit: + * out[i] < 7 * 2^64 < 2^67 + */ +static void felem_small_mul(longfelem out, const smallfelem small1, const felem in2) + { + smallfelem small2; + felem_shrink(small2, in2); + smallfelem_mul(out, small1, small2); + } + +#define two100m36m4 (((limb)1) << 100) - (((limb)1) << 36) - (((limb)1) << 4) +#define two100 (((limb)1) << 100) +#define two100m36p4 (((limb)1) << 100) - (((limb)1) << 36) + (((limb)1) << 4) +/* zero100 is 0 mod p */ +static const felem zero100 = { two100m36m4, two100, two100m36p4, two100m36p4 }; + +/* Internal function for the different flavours of felem_reduce. + * felem_reduce_ reduces the higher coefficients in[4]-in[7]. + * On entry: + * out[0] >= in[6] + 2^32*in[6] + in[7] + 2^32*in[7] + * out[1] >= in[7] + 2^32*in[4] + * out[2] >= in[5] + 2^32*in[5] + * out[3] >= in[4] + 2^32*in[5] + 2^32*in[6] + * On exit: + * out[0] <= out[0] + in[4] + 2^32*in[5] + * out[1] <= out[1] + in[5] + 2^33*in[6] + * out[2] <= out[2] + in[7] + 2*in[6] + 2^33*in[7] + * out[3] <= out[3] + 2^32*in[4] + 3*in[7] + */ +static void felem_reduce_(felem out, const longfelem in) + { + int128_t c; + /* combine common terms from below */ + c = in[4] + (in[5] << 32); + out[0] += c; + out[3] -= c; + + c = in[5] - in[7]; + out[1] += c; + out[2] -= c; + + /* the remaining terms */ + /* 256: [(0,1),(96,-1),(192,-1),(224,1)] */ + out[1] -= (in[4] << 32); + out[3] += (in[4] << 32); + + /* 320: [(32,1),(64,1),(128,-1),(160,-1),(224,-1)] */ + out[2] -= (in[5] << 32); + + /* 384: [(0,-1),(32,-1),(96,2),(128,2),(224,-1)] */ + out[0] -= in[6]; + out[0] -= (in[6] << 32); + out[1] += (in[6] << 33); + out[2] += (in[6] * 2); + out[3] -= (in[6] << 32); + + /* 448: [(0,-1),(32,-1),(64,-1),(128,1),(160,2),(192,3)] */ + out[0] -= in[7]; + out[0] -= (in[7] << 32); + out[2] += (in[7] << 33); + out[3] += (in[7] * 3); + } + +/* felem_reduce converts a longfelem into an felem. + * To be called directly after felem_square or felem_mul. + * On entry: + * in[0] < 2^64, in[1] < 3*2^64, in[2] < 5*2^64, in[3] < 7*2^64 + * in[4] < 7*2^64, in[5] < 5*2^64, in[6] < 3*2^64, in[7] < 2*64 + * On exit: + * out[i] < 2^101 + */ +static void felem_reduce(felem out, const longfelem in) + { + out[0] = zero100[0] + in[0]; + out[1] = zero100[1] + in[1]; + out[2] = zero100[2] + in[2]; + out[3] = zero100[3] + in[3]; + + felem_reduce_(out, in); + + /* out[0] > 2^100 - 2^36 - 2^4 - 3*2^64 - 3*2^96 - 2^64 - 2^96 > 0 + * out[1] > 2^100 - 2^64 - 7*2^96 > 0 + * out[2] > 2^100 - 2^36 + 2^4 - 5*2^64 - 5*2^96 > 0 + * out[3] > 2^100 - 2^36 + 2^4 - 7*2^64 - 5*2^96 - 3*2^96 > 0 + * + * out[0] < 2^100 + 2^64 + 7*2^64 + 5*2^96 < 2^101 + * out[1] < 2^100 + 3*2^64 + 5*2^64 + 3*2^97 < 2^101 + * out[2] < 2^100 + 5*2^64 + 2^64 + 3*2^65 + 2^97 < 2^101 + * out[3] < 2^100 + 7*2^64 + 7*2^96 + 3*2^64 < 2^101 + */ + } + +/* felem_reduce_zero105 converts a larger longfelem into an felem. + * On entry: + * in[0] < 2^71 + * On exit: + * out[i] < 2^106 + */ +static void felem_reduce_zero105(felem out, const longfelem in) + { + out[0] = zero105[0] + in[0]; + out[1] = zero105[1] + in[1]; + out[2] = zero105[2] + in[2]; + out[3] = zero105[3] + in[3]; + + felem_reduce_(out, in); + + /* out[0] > 2^105 - 2^41 - 2^9 - 2^71 - 2^103 - 2^71 - 2^103 > 0 + * out[1] > 2^105 - 2^71 - 2^103 > 0 + * out[2] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 > 0 + * out[3] > 2^105 - 2^41 + 2^9 - 2^71 - 2^103 - 2^103 > 0 + * + * out[0] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106 + * out[1] < 2^105 + 2^71 + 2^71 + 2^103 < 2^106 + * out[2] < 2^105 + 2^71 + 2^71 + 2^71 + 2^103 < 2^106 + * out[3] < 2^105 + 2^71 + 2^103 + 2^71 < 2^106 + */ + } + +/* subtract_u64 sets *result = *result - v and *carry to one if the subtraction + * underflowed. */ +static void subtract_u64(u64* result, u64* carry, u64 v) + { + uint128_t r = *result; + r -= v; + *carry = (r >> 64) & 1; + *result = (u64) r; + } + +/* felem_contract converts |in| to its unique, minimal representation. + * On entry: + * in[i] < 2^109 + */ +static void felem_contract(smallfelem out, const felem in) + { + unsigned i; + u64 all_equal_so_far = 0, result = 0, carry; + + felem_shrink(out, in); + /* small is minimal except that the value might be > p */ + + all_equal_so_far--; + /* We are doing a constant time test if out >= kPrime. We need to + * compare each u64, from most-significant to least significant. For + * each one, if all words so far have been equal (m is all ones) then a + * non-equal result is the answer. Otherwise we continue. */ + for (i = 3; i < 4; i--) + { + u64 equal; + uint128_t a = ((uint128_t) kPrime[i]) - out[i]; + /* if out[i] > kPrime[i] then a will underflow and the high + * 64-bits will all be set. */ + result |= all_equal_so_far & ((u64) (a >> 64)); + + /* if kPrime[i] == out[i] then |equal| will be all zeros and + * the decrement will make it all ones. */ + equal = kPrime[i] ^ out[i]; + equal--; + equal &= equal << 32; + equal &= equal << 16; + equal &= equal << 8; + equal &= equal << 4; + equal &= equal << 2; + equal &= equal << 1; + equal = ((s64) equal) >> 63; + + all_equal_so_far &= equal; + } + + /* if all_equal_so_far is still all ones then the two values are equal + * and so out >= kPrime is true. */ + result |= all_equal_so_far; + + /* if out >= kPrime then we subtract kPrime. */ + subtract_u64(&out[0], &carry, result & kPrime[0]); + subtract_u64(&out[1], &carry, carry); + subtract_u64(&out[2], &carry, carry); + subtract_u64(&out[3], &carry, carry); + + subtract_u64(&out[1], &carry, result & kPrime[1]); + subtract_u64(&out[2], &carry, carry); + subtract_u64(&out[3], &carry, carry); + + subtract_u64(&out[2], &carry, result & kPrime[2]); + subtract_u64(&out[3], &carry, carry); + + subtract_u64(&out[3], &carry, result & kPrime[3]); + } + +static void smallfelem_square_contract(smallfelem out, const smallfelem in) + { + longfelem longtmp; + felem tmp; + + smallfelem_square(longtmp, in); + felem_reduce(tmp, longtmp); + felem_contract(out, tmp); + } + +static void smallfelem_mul_contract(smallfelem out, const smallfelem in1, const smallfelem in2) + { + longfelem longtmp; + felem tmp; + + smallfelem_mul(longtmp, in1, in2); + felem_reduce(tmp, longtmp); + felem_contract(out, tmp); + } + +/* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0 + * otherwise. + * On entry: + * small[i] < 2^64 + */ +static limb smallfelem_is_zero(const smallfelem small) + { + limb result; + u64 is_p; + + u64 is_zero = small[0] | small[1] | small[2] | small[3]; + is_zero--; + is_zero &= is_zero << 32; + is_zero &= is_zero << 16; + is_zero &= is_zero << 8; + is_zero &= is_zero << 4; + is_zero &= is_zero << 2; + is_zero &= is_zero << 1; + is_zero = ((s64) is_zero) >> 63; + + is_p = (small[0] ^ kPrime[0]) | + (small[1] ^ kPrime[1]) | + (small[2] ^ kPrime[2]) | + (small[3] ^ kPrime[3]); + is_p--; + is_p &= is_p << 32; + is_p &= is_p << 16; + is_p &= is_p << 8; + is_p &= is_p << 4; + is_p &= is_p << 2; + is_p &= is_p << 1; + is_p = ((s64) is_p) >> 63; + + is_zero |= is_p; + + result = is_zero; + result |= ((limb) is_zero) << 64; + return result; + } + +static int smallfelem_is_zero_int(const smallfelem small) + { + return (int) (smallfelem_is_zero(small) & ((limb)1)); + } + +/* felem_inv calculates |out| = |in|^{-1} + * + * Based on Fermat's Little Theorem: + * a^p = a (mod p) + * a^{p-1} = 1 (mod p) + * a^{p-2} = a^{-1} (mod p) + */ +static void felem_inv(felem out, const felem in) + { + felem ftmp, ftmp2; + /* each e_I will hold |in|^{2^I - 1} */ + felem e2, e4, e8, e16, e32, e64; + longfelem tmp; + unsigned i; + + felem_square(tmp, in); felem_reduce(ftmp, tmp); /* 2^1 */ + felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */ + felem_assign(e2, ftmp); + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */ + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^4 - 2^2 */ + felem_mul(tmp, ftmp, e2); felem_reduce(ftmp, tmp); /* 2^4 - 2^0 */ + felem_assign(e4, ftmp); + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^5 - 2^1 */ + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^6 - 2^2 */ + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^7 - 2^3 */ + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^8 - 2^4 */ + felem_mul(tmp, ftmp, e4); felem_reduce(ftmp, tmp); /* 2^8 - 2^0 */ + felem_assign(e8, ftmp); + for (i = 0; i < 8; i++) { + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); + } /* 2^16 - 2^8 */ + felem_mul(tmp, ftmp, e8); felem_reduce(ftmp, tmp); /* 2^16 - 2^0 */ + felem_assign(e16, ftmp); + for (i = 0; i < 16; i++) { + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); + } /* 2^32 - 2^16 */ + felem_mul(tmp, ftmp, e16); felem_reduce(ftmp, tmp); /* 2^32 - 2^0 */ + felem_assign(e32, ftmp); + for (i = 0; i < 32; i++) { + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); + } /* 2^64 - 2^32 */ + felem_assign(e64, ftmp); + felem_mul(tmp, ftmp, in); felem_reduce(ftmp, tmp); /* 2^64 - 2^32 + 2^0 */ + for (i = 0; i < 192; i++) { + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); + } /* 2^256 - 2^224 + 2^192 */ + + felem_mul(tmp, e64, e32); felem_reduce(ftmp2, tmp); /* 2^64 - 2^0 */ + for (i = 0; i < 16; i++) { + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); + } /* 2^80 - 2^16 */ + felem_mul(tmp, ftmp2, e16); felem_reduce(ftmp2, tmp); /* 2^80 - 2^0 */ + for (i = 0; i < 8; i++) { + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); + } /* 2^88 - 2^8 */ + felem_mul(tmp, ftmp2, e8); felem_reduce(ftmp2, tmp); /* 2^88 - 2^0 */ + for (i = 0; i < 4; i++) { + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); + } /* 2^92 - 2^4 */ + felem_mul(tmp, ftmp2, e4); felem_reduce(ftmp2, tmp); /* 2^92 - 2^0 */ + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); /* 2^93 - 2^1 */ + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); /* 2^94 - 2^2 */ + felem_mul(tmp, ftmp2, e2); felem_reduce(ftmp2, tmp); /* 2^94 - 2^0 */ + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); /* 2^95 - 2^1 */ + felem_square(tmp, ftmp2); felem_reduce(ftmp2, tmp); /* 2^96 - 2^2 */ + felem_mul(tmp, ftmp2, in); felem_reduce(ftmp2, tmp); /* 2^96 - 3 */ + + felem_mul(tmp, ftmp2, ftmp); felem_reduce(out, tmp); /* 2^256 - 2^224 + 2^192 + 2^96 - 3 */ + } + +static void smallfelem_inv_contract(smallfelem out, const smallfelem in) + { + felem tmp; + + smallfelem_expand(tmp, in); + felem_inv(tmp, tmp); + felem_contract(out, tmp); + } + +/* Group operations + * ---------------- + * + * Building on top of the field operations we have the operations on the + * elliptic curve group itself. Points on the curve are represented in Jacobian + * coordinates */ + +/* point_double calculates 2*(x_in, y_in, z_in) + * + * The method is taken from: + * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b + * + * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed. + * while x_out == y_in is not (maybe this works, but it's not tested). */ +static void +point_double(felem x_out, felem y_out, felem z_out, + const felem x_in, const felem y_in, const felem z_in) + { + longfelem tmp, tmp2; + felem delta, gamma, beta, alpha, ftmp, ftmp2; + smallfelem small1, small2; + + felem_assign(ftmp, x_in); + /* ftmp[i] < 2^106 */ + felem_assign(ftmp2, x_in); + /* ftmp2[i] < 2^106 */ + + /* delta = z^2 */ + felem_square(tmp, z_in); + felem_reduce(delta, tmp); + /* delta[i] < 2^101 */ + + /* gamma = y^2 */ + felem_square(tmp, y_in); + felem_reduce(gamma, tmp); + /* gamma[i] < 2^101 */ + felem_shrink(small1, gamma); + + /* beta = x*gamma */ + felem_small_mul(tmp, small1, x_in); + felem_reduce(beta, tmp); + /* beta[i] < 2^101 */ + + /* alpha = 3*(x-delta)*(x+delta) */ + felem_diff(ftmp, delta); + /* ftmp[i] < 2^105 + 2^106 < 2^107 */ + felem_sum(ftmp2, delta); + /* ftmp2[i] < 2^105 + 2^106 < 2^107 */ + felem_scalar(ftmp2, 3); + /* ftmp2[i] < 3 * 2^107 < 2^109 */ + felem_mul(tmp, ftmp, ftmp2); + felem_reduce(alpha, tmp); + /* alpha[i] < 2^101 */ + felem_shrink(small2, alpha); + + /* x' = alpha^2 - 8*beta */ + smallfelem_square(tmp, small2); + felem_reduce(x_out, tmp); + felem_assign(ftmp, beta); + felem_scalar(ftmp, 8); + /* ftmp[i] < 8 * 2^101 = 2^104 */ + felem_diff(x_out, ftmp); + /* x_out[i] < 2^105 + 2^101 < 2^106 */ + + /* z' = (y + z)^2 - gamma - delta */ + felem_sum(delta, gamma); + /* delta[i] < 2^101 + 2^101 = 2^102 */ + felem_assign(ftmp, y_in); + felem_sum(ftmp, z_in); + /* ftmp[i] < 2^106 + 2^106 = 2^107 */ + felem_square(tmp, ftmp); + felem_reduce(z_out, tmp); + felem_diff(z_out, delta); + /* z_out[i] < 2^105 + 2^101 < 2^106 */ + + /* y' = alpha*(4*beta - x') - 8*gamma^2 */ + felem_scalar(beta, 4); + /* beta[i] < 4 * 2^101 = 2^103 */ + felem_diff_zero107(beta, x_out); + /* beta[i] < 2^107 + 2^103 < 2^108 */ + felem_small_mul(tmp, small2, beta); + /* tmp[i] < 7 * 2^64 < 2^67 */ + smallfelem_square(tmp2, small1); + /* tmp2[i] < 7 * 2^64 */ + longfelem_scalar(tmp2, 8); + /* tmp2[i] < 8 * 7 * 2^64 = 7 * 2^67 */ + longfelem_diff(tmp, tmp2); + /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */ + felem_reduce_zero105(y_out, tmp); + /* y_out[i] < 2^106 */ + } + +/* point_double_small is the same as point_double, except that it operates on + * smallfelems */ +static void +point_double_small(smallfelem x_out, smallfelem y_out, smallfelem z_out, + const smallfelem x_in, const smallfelem y_in, const smallfelem z_in) + { + felem felem_x_out, felem_y_out, felem_z_out; + felem felem_x_in, felem_y_in, felem_z_in; + + smallfelem_expand(felem_x_in, x_in); + smallfelem_expand(felem_y_in, y_in); + smallfelem_expand(felem_z_in, z_in); + point_double(felem_x_out, felem_y_out, felem_z_out, + felem_x_in, felem_y_in, felem_z_in); + felem_shrink(x_out, felem_x_out); + felem_shrink(y_out, felem_y_out); + felem_shrink(z_out, felem_z_out); + } + +/* copy_conditional copies in to out iff mask is all ones. */ +static void +copy_conditional(felem out, const felem in, limb mask) + { + unsigned i; + for (i = 0; i < NLIMBS; ++i) + { + const limb tmp = mask & (in[i] ^ out[i]); + out[i] ^= tmp; + } + } + +/* copy_small_conditional copies in to out iff mask is all ones. */ +static void +copy_small_conditional(felem out, const smallfelem in, limb mask) + { + unsigned i; + const u64 mask64 = mask; + for (i = 0; i < NLIMBS; ++i) + { + out[i] = ((limb) (in[i] & mask64)) | (out[i] & ~mask); + } + } + +/* point_add calcuates (x1, y1, z1) + (x2, y2, z2) + * + * The method is taken from: + * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, + * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). + * + * This function includes a branch for checking whether the two input points + * are equal, (while not equal to the point at infinity). This case never + * happens during single point multiplication, so there is no timing leak for + * ECDH or ECDSA signing. */ +static void point_add(felem x3, felem y3, felem z3, + const felem x1, const felem y1, const felem z1, + const int mixed, const smallfelem x2, const smallfelem y2, const smallfelem z2) + { + felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out; + longfelem tmp, tmp2; + smallfelem small1, small2, small3, small4, small5; + limb x_equal, y_equal, z1_is_zero, z2_is_zero; + + felem_shrink(small3, z1); + + z1_is_zero = smallfelem_is_zero(small3); + z2_is_zero = smallfelem_is_zero(z2); + + /* ftmp = z1z1 = z1**2 */ + smallfelem_square(tmp, small3); + felem_reduce(ftmp, tmp); + /* ftmp[i] < 2^101 */ + felem_shrink(small1, ftmp); + + if(!mixed) + { + /* ftmp2 = z2z2 = z2**2 */ + smallfelem_square(tmp, z2); + felem_reduce(ftmp2, tmp); + /* ftmp2[i] < 2^101 */ + felem_shrink(small2, ftmp2); + + felem_shrink(small5, x1); + + /* u1 = ftmp3 = x1*z2z2 */ + smallfelem_mul(tmp, small5, small2); + felem_reduce(ftmp3, tmp); + /* ftmp3[i] < 2^101 */ + + /* ftmp5 = z1 + z2 */ + felem_assign(ftmp5, z1); + felem_small_sum(ftmp5, z2); + /* ftmp5[i] < 2^107 */ + + /* ftmp5 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 */ + felem_square(tmp, ftmp5); + felem_reduce(ftmp5, tmp); + /* ftmp2 = z2z2 + z1z1 */ + felem_sum(ftmp2, ftmp); + /* ftmp2[i] < 2^101 + 2^101 = 2^102 */ + felem_diff(ftmp5, ftmp2); + /* ftmp5[i] < 2^105 + 2^101 < 2^106 */ + + /* ftmp2 = z2 * z2z2 */ + smallfelem_mul(tmp, small2, z2); + felem_reduce(ftmp2, tmp); + + /* s1 = ftmp2 = y1 * z2**3 */ + felem_mul(tmp, y1, ftmp2); + felem_reduce(ftmp6, tmp); + /* ftmp6[i] < 2^101 */ + } + else + { + /* We'll assume z2 = 1 (special case z2 = 0 is handled later) */ + + /* u1 = ftmp3 = x1*z2z2 */ + felem_assign(ftmp3, x1); + /* ftmp3[i] < 2^106 */ + + /* ftmp5 = 2z1z2 */ + felem_assign(ftmp5, z1); + felem_scalar(ftmp5, 2); + /* ftmp5[i] < 2*2^106 = 2^107 */ + + /* s1 = ftmp2 = y1 * z2**3 */ + felem_assign(ftmp6, y1); + /* ftmp6[i] < 2^106 */ + } + + /* u2 = x2*z1z1 */ + smallfelem_mul(tmp, x2, small1); + felem_reduce(ftmp4, tmp); + + /* h = ftmp4 = u2 - u1 */ + felem_diff_zero107(ftmp4, ftmp3); + /* ftmp4[i] < 2^107 + 2^101 < 2^108 */ + felem_shrink(small4, ftmp4); + + x_equal = smallfelem_is_zero(small4); + + /* z_out = ftmp5 * h */ + felem_small_mul(tmp, small4, ftmp5); + felem_reduce(z_out, tmp); + /* z_out[i] < 2^101 */ + + /* ftmp = z1 * z1z1 */ + smallfelem_mul(tmp, small1, small3); + felem_reduce(ftmp, tmp); + + /* s2 = tmp = y2 * z1**3 */ + felem_small_mul(tmp, y2, ftmp); + felem_reduce(ftmp5, tmp); + + /* r = ftmp5 = (s2 - s1)*2 */ + felem_diff_zero107(ftmp5, ftmp6); + /* ftmp5[i] < 2^107 + 2^107 = 2^108*/ + felem_scalar(ftmp5, 2); + /* ftmp5[i] < 2^109 */ + felem_shrink(small1, ftmp5); + y_equal = smallfelem_is_zero(small1); + + if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) + { + point_double(x3, y3, z3, x1, y1, z1); + return; + } + + /* I = ftmp = (2h)**2 */ + felem_assign(ftmp, ftmp4); + felem_scalar(ftmp, 2); + /* ftmp[i] < 2*2^108 = 2^109 */ + felem_square(tmp, ftmp); + felem_reduce(ftmp, tmp); + + /* J = ftmp2 = h * I */ + felem_mul(tmp, ftmp4, ftmp); + felem_reduce(ftmp2, tmp); + + /* V = ftmp4 = U1 * I */ + felem_mul(tmp, ftmp3, ftmp); + felem_reduce(ftmp4, tmp); + + /* x_out = r**2 - J - 2V */ + smallfelem_square(tmp, small1); + felem_reduce(x_out, tmp); + felem_assign(ftmp3, ftmp4); + felem_scalar(ftmp4, 2); + felem_sum(ftmp4, ftmp2); + /* ftmp4[i] < 2*2^101 + 2^101 < 2^103 */ + felem_diff(x_out, ftmp4); + /* x_out[i] < 2^105 + 2^101 */ + + /* y_out = r(V-x_out) - 2 * s1 * J */ + felem_diff_zero107(ftmp3, x_out); + /* ftmp3[i] < 2^107 + 2^101 < 2^108 */ + felem_small_mul(tmp, small1, ftmp3); + felem_mul(tmp2, ftmp6, ftmp2); + longfelem_scalar(tmp2, 2); + /* tmp2[i] < 2*2^67 = 2^68 */ + longfelem_diff(tmp, tmp2); + /* tmp[i] < 2^67 + 2^70 + 2^40 < 2^71 */ + felem_reduce_zero105(y_out, tmp); + /* y_out[i] < 2^106 */ + + copy_small_conditional(x_out, x2, z1_is_zero); + copy_conditional(x_out, x1, z2_is_zero); + copy_small_conditional(y_out, y2, z1_is_zero); + copy_conditional(y_out, y1, z2_is_zero); + copy_small_conditional(z_out, z2, z1_is_zero); + copy_conditional(z_out, z1, z2_is_zero); + felem_assign(x3, x_out); + felem_assign(y3, y_out); + felem_assign(z3, z_out); + } + +/* point_add_small is the same as point_add, except that it operates on + * smallfelems */ +static void point_add_small(smallfelem x3, smallfelem y3, smallfelem z3, + smallfelem x1, smallfelem y1, smallfelem z1, + smallfelem x2, smallfelem y2, smallfelem z2) + { + felem felem_x3, felem_y3, felem_z3; + felem felem_x1, felem_y1, felem_z1; + smallfelem_expand(felem_x1, x1); + smallfelem_expand(felem_y1, y1); + smallfelem_expand(felem_z1, z1); + point_add(felem_x3, felem_y3, felem_z3, felem_x1, felem_y1, felem_z1, 0, x2, y2, z2); + felem_shrink(x3, felem_x3); + felem_shrink(y3, felem_y3); + felem_shrink(z3, felem_z3); + } + +/* Base point pre computation + * -------------------------- + * + * Two different sorts of precomputed tables are used in the following code. + * Each contain various points on the curve, where each point is three field + * elements (x, y, z). + * + * For the base point table, z is usually 1 (0 for the point at infinity). + * This table has 2 * 16 elements, starting with the following: + * index | bits | point + * ------+---------+------------------------------ + * 0 | 0 0 0 0 | 0G + * 1 | 0 0 0 1 | 1G + * 2 | 0 0 1 0 | 2^64G + * 3 | 0 0 1 1 | (2^64 + 1)G + * 4 | 0 1 0 0 | 2^128G + * 5 | 0 1 0 1 | (2^128 + 1)G + * 6 | 0 1 1 0 | (2^128 + 2^64)G + * 7 | 0 1 1 1 | (2^128 + 2^64 + 1)G + * 8 | 1 0 0 0 | 2^192G + * 9 | 1 0 0 1 | (2^192 + 1)G + * 10 | 1 0 1 0 | (2^192 + 2^64)G + * 11 | 1 0 1 1 | (2^192 + 2^64 + 1)G + * 12 | 1 1 0 0 | (2^192 + 2^128)G + * 13 | 1 1 0 1 | (2^192 + 2^128 + 1)G + * 14 | 1 1 1 0 | (2^192 + 2^128 + 2^64)G + * 15 | 1 1 1 1 | (2^192 + 2^128 + 2^64 + 1)G + * followed by a copy of this with each element multiplied by 2^32. + * + * The reason for this is so that we can clock bits into four different + * locations when doing simple scalar multiplies against the base point, + * and then another four locations using the second 16 elements. + * + * Tables for other points have table[i] = iG for i in 0 .. 16. */ + +/* gmul is the table of precomputed base points */ +static const smallfelem gmul[2][16][3] = +{{{{0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}}, + {{0xf4a13945d898c296, 0x77037d812deb33a0, 0xf8bce6e563a440f2, 0x6b17d1f2e12c4247}, + {0xcbb6406837bf51f5, 0x2bce33576b315ece, 0x8ee7eb4a7c0f9e16, 0x4fe342e2fe1a7f9b}, + {1, 0, 0, 0}}, + {{0x90e75cb48e14db63, 0x29493baaad651f7e, 0x8492592e326e25de, 0x0fa822bc2811aaa5}, + {0xe41124545f462ee7, 0x34b1a65050fe82f5, 0x6f4ad4bcb3df188b, 0xbff44ae8f5dba80d}, + {1, 0, 0, 0}}, + {{0x93391ce2097992af, 0xe96c98fd0d35f1fa, 0xb257c0de95e02789, 0x300a4bbc89d6726f}, + {0xaa54a291c08127a0, 0x5bb1eeada9d806a5, 0x7f1ddb25ff1e3c6f, 0x72aac7e0d09b4644}, + {1, 0, 0, 0}}, + {{0x57c84fc9d789bd85, 0xfc35ff7dc297eac3, 0xfb982fd588c6766e, 0x447d739beedb5e67}, + {0x0c7e33c972e25b32, 0x3d349b95a7fae500, 0xe12e9d953a4aaff7, 0x2d4825ab834131ee}, + {1, 0, 0, 0}}, + {{0x13949c932a1d367f, 0xef7fbd2b1a0a11b7, 0xddc6068bb91dfc60, 0xef9519328a9c72ff}, + {0x196035a77376d8a8, 0x23183b0895ca1740, 0xc1ee9807022c219c, 0x611e9fc37dbb2c9b}, + {1, 0, 0, 0}}, + {{0xcae2b1920b57f4bc, 0x2936df5ec6c9bc36, 0x7dea6482e11238bf, 0x550663797b51f5d8}, + {0x44ffe216348a964c, 0x9fb3d576dbdefbe1, 0x0afa40018d9d50e5, 0x157164848aecb851}, + {1, 0, 0, 0}}, + {{0xe48ecafffc5cde01, 0x7ccd84e70d715f26, 0xa2e8f483f43e4391, 0xeb5d7745b21141ea}, + {0xcac917e2731a3479, 0x85f22cfe2844b645, 0x0990e6a158006cee, 0xeafd72ebdbecc17b}, + {1, 0, 0, 0}}, + {{0x6cf20ffb313728be, 0x96439591a3c6b94a, 0x2736ff8344315fc5, 0xa6d39677a7849276}, + {0xf2bab833c357f5f4, 0x824a920c2284059b, 0x66b8babd2d27ecdf, 0x674f84749b0b8816}, + {1, 0, 0, 0}}, + {{0x2df48c04677c8a3e, 0x74e02f080203a56b, 0x31855f7db8c7fedb, 0x4e769e7672c9ddad}, + {0xa4c36165b824bbb0, 0xfb9ae16f3b9122a5, 0x1ec0057206947281, 0x42b99082de830663}, + {1, 0, 0, 0}}, + {{0x6ef95150dda868b9, 0xd1f89e799c0ce131, 0x7fdc1ca008a1c478, 0x78878ef61c6ce04d}, + {0x9c62b9121fe0d976, 0x6ace570ebde08d4f, 0xde53142c12309def, 0xb6cb3f5d7b72c321}, + {1, 0, 0, 0}}, + {{0x7f991ed2c31a3573, 0x5b82dd5bd54fb496, 0x595c5220812ffcae, 0x0c88bc4d716b1287}, + {0x3a57bf635f48aca8, 0x7c8181f4df2564f3, 0x18d1b5b39c04e6aa, 0xdd5ddea3f3901dc6}, + {1, 0, 0, 0}}, + {{0xe96a79fb3e72ad0c, 0x43a0a28c42ba792f, 0xefe0a423083e49f3, 0x68f344af6b317466}, + {0xcdfe17db3fb24d4a, 0x668bfc2271f5c626, 0x604ed93c24d67ff3, 0x31b9c405f8540a20}, + {1, 0, 0, 0}}, + {{0xd36b4789a2582e7f, 0x0d1a10144ec39c28, 0x663c62c3edbad7a0, 0x4052bf4b6f461db9}, + {0x235a27c3188d25eb, 0xe724f33999bfcc5b, 0x862be6bd71d70cc8, 0xfecf4d5190b0fc61}, + {1, 0, 0, 0}}, + {{0x74346c10a1d4cfac, 0xafdf5cc08526a7a4, 0x123202a8f62bff7a, 0x1eddbae2c802e41a}, + {0x8fa0af2dd603f844, 0x36e06b7e4c701917, 0x0c45f45273db33a0, 0x43104d86560ebcfc}, + {1, 0, 0, 0}}, + {{0x9615b5110d1d78e5, 0x66b0de3225c4744b, 0x0a4a46fb6aaf363a, 0xb48e26b484f7a21c}, + {0x06ebb0f621a01b2d, 0xc004e4048b7b0f98, 0x64131bcdfed6f668, 0xfac015404d4d3dab}, + {1, 0, 0, 0}}}, + {{{0, 0, 0, 0}, + {0, 0, 0, 0}, + {0, 0, 0, 0}}, + {{0x3a5a9e22185a5943, 0x1ab919365c65dfb6, 0x21656b32262c71da, 0x7fe36b40af22af89}, + {0xd50d152c699ca101, 0x74b3d5867b8af212, 0x9f09f40407dca6f1, 0xe697d45825b63624}, + {1, 0, 0, 0}}, + {{0xa84aa9397512218e, 0xe9a521b074ca0141, 0x57880b3a18a2e902, 0x4a5b506612a677a6}, + {0x0beada7a4c4f3840, 0x626db15419e26d9d, 0xc42604fbe1627d40, 0xeb13461ceac089f1}, + {1, 0, 0, 0}}, + {{0xf9faed0927a43281, 0x5e52c4144103ecbc, 0xc342967aa815c857, 0x0781b8291c6a220a}, + {0x5a8343ceeac55f80, 0x88f80eeee54a05e3, 0x97b2a14f12916434, 0x690cde8df0151593}, + {1, 0, 0, 0}}, + {{0xaee9c75df7f82f2a, 0x9e4c35874afdf43a, 0xf5622df437371326, 0x8a535f566ec73617}, + {0xc5f9a0ac223094b7, 0xcde533864c8c7669, 0x37e02819085a92bf, 0x0455c08468b08bd7}, + {1, 0, 0, 0}}, + {{0x0c0a6e2c9477b5d9, 0xf9a4bf62876dc444, 0x5050a949b6cdc279, 0x06bada7ab77f8276}, + {0xc8b4aed1ea48dac9, 0xdebd8a4b7ea1070f, 0x427d49101366eb70, 0x5b476dfd0e6cb18a}, + {1, 0, 0, 0}}, + {{0x7c5c3e44278c340a, 0x4d54606812d66f3b, 0x29a751b1ae23c5d8, 0x3e29864e8a2ec908}, + {0x142d2a6626dbb850, 0xad1744c4765bd780, 0x1f150e68e322d1ed, 0x239b90ea3dc31e7e}, + {1, 0, 0, 0}}, + {{0x78c416527a53322a, 0x305dde6709776f8e, 0xdbcab759f8862ed4, 0x820f4dd949f72ff7}, + {0x6cc544a62b5debd4, 0x75be5d937b4e8cc4, 0x1b481b1b215c14d3, 0x140406ec783a05ec}, + {1, 0, 0, 0}}, + {{0x6a703f10e895df07, 0xfd75f3fa01876bd8, 0xeb5b06e70ce08ffe, 0x68f6b8542783dfee}, + {0x90c76f8a78712655, 0xcf5293d2f310bf7f, 0xfbc8044dfda45028, 0xcbe1feba92e40ce6}, + {1, 0, 0, 0}}, + {{0xe998ceea4396e4c1, 0xfc82ef0b6acea274, 0x230f729f2250e927, 0xd0b2f94d2f420109}, + {0x4305adddb38d4966, 0x10b838f8624c3b45, 0x7db2636658954e7a, 0x971459828b0719e5}, + {1, 0, 0, 0}}, + {{0x4bd6b72623369fc9, 0x57f2929e53d0b876, 0xc2d5cba4f2340687, 0x961610004a866aba}, + {0x49997bcd2e407a5e, 0x69ab197d92ddcb24, 0x2cf1f2438fe5131c, 0x7acb9fadcee75e44}, + {1, 0, 0, 0}}, + {{0x254e839423d2d4c0, 0xf57f0c917aea685b, 0xa60d880f6f75aaea, 0x24eb9acca333bf5b}, + {0xe3de4ccb1cda5dea, 0xfeef9341c51a6b4f, 0x743125f88bac4c4d, 0x69f891c5acd079cc}, + {1, 0, 0, 0}}, + {{0xeee44b35702476b5, 0x7ed031a0e45c2258, 0xb422d1e7bd6f8514, 0xe51f547c5972a107}, + {0xa25bcd6fc9cf343d, 0x8ca922ee097c184e, 0xa62f98b3a9fe9a06, 0x1c309a2b25bb1387}, + {1, 0, 0, 0}}, + {{0x9295dbeb1967c459, 0xb00148833472c98e, 0xc504977708011828, 0x20b87b8aa2c4e503}, + {0x3063175de057c277, 0x1bd539338fe582dd, 0x0d11adef5f69a044, 0xf5c6fa49919776be}, + {1, 0, 0, 0}}, + {{0x8c944e760fd59e11, 0x3876cba1102fad5f, 0xa454c3fad83faa56, 0x1ed7d1b9332010b9}, + {0xa1011a270024b889, 0x05e4d0dcac0cd344, 0x52b520f0eb6a2a24, 0x3a2b03f03217257a}, + {1, 0, 0, 0}}, + {{0xf20fc2afdf1d043d, 0xf330240db58d5a62, 0xfc7d229ca0058c3b, 0x15fee545c78dd9f6}, + {0x501e82885bc98cda, 0x41ef80e5d046ac04, 0x557d9f49461210fb, 0x4ab5b6b2b8753f81}, + {1, 0, 0, 0}}}}; + +/* select_point selects the |idx|th point from a precomputation table and + * copies it to out. */ +static void select_point(const u64 idx, unsigned int size, const smallfelem pre_comp[16][3], smallfelem out[3]) + { + unsigned i, j; + u64 *outlimbs = &out[0][0]; + memset(outlimbs, 0, 3 * sizeof(smallfelem)); + + for (i = 0; i < size; i++) + { + const u64 *inlimbs = (u64*) &pre_comp[i][0][0]; + u64 mask = i ^ idx; + mask |= mask >> 4; + mask |= mask >> 2; + mask |= mask >> 1; + mask &= 1; + mask--; + for (j = 0; j < NLIMBS * 3; j++) + outlimbs[j] |= inlimbs[j] & mask; + } + } + +/* get_bit returns the |i|th bit in |in| */ +static char get_bit(const felem_bytearray in, int i) + { + if ((i < 0) || (i >= 256)) + return 0; + return (in[i >> 3] >> (i & 7)) & 1; + } + +/* Interleaved point multiplication using precomputed point multiples: + * The small point multiples 0*P, 1*P, ..., 17*P are in pre_comp[], + * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple + * of the generator, using certain (large) precomputed multiples in g_pre_comp. + * Output point (X, Y, Z) is stored in x_out, y_out, z_out */ +static void batch_mul(felem x_out, felem y_out, felem z_out, + const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar, + const int mixed, const smallfelem pre_comp[][17][3], const smallfelem g_pre_comp[2][16][3]) + { + int i, skip; + unsigned num, gen_mul = (g_scalar != NULL); + felem nq[3], ftmp; + smallfelem tmp[3]; + u64 bits; + u8 sign, digit; + + /* set nq to the point at infinity */ + memset(nq, 0, 3 * sizeof(felem)); + + /* Loop over all scalars msb-to-lsb, interleaving additions + * of multiples of the generator (two in each of the last 32 rounds) + * and additions of other points multiples (every 5th round). + */ + skip = 1; /* save two point operations in the first round */ + for (i = (num_points ? 255 : 31); i >= 0; --i) + { + /* double */ + if (!skip) + point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); + + /* add multiples of the generator */ + if (gen_mul && (i <= 31)) + { + /* first, look 32 bits upwards */ + bits = get_bit(g_scalar, i + 224) << 3; + bits |= get_bit(g_scalar, i + 160) << 2; + bits |= get_bit(g_scalar, i + 96) << 1; + bits |= get_bit(g_scalar, i + 32); + /* select the point to add, in constant time */ + select_point(bits, 16, g_pre_comp[1], tmp); + + if (!skip) + { + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + 1 /* mixed */, tmp[0], tmp[1], tmp[2]); + } + else + { + smallfelem_expand(nq[0], tmp[0]); + smallfelem_expand(nq[1], tmp[1]); + smallfelem_expand(nq[2], tmp[2]); + skip = 0; + } + + /* second, look at the current position */ + bits = get_bit(g_scalar, i + 192) << 3; + bits |= get_bit(g_scalar, i + 128) << 2; + bits |= get_bit(g_scalar, i + 64) << 1; + bits |= get_bit(g_scalar, i); + /* select the point to add, in constant time */ + select_point(bits, 16, g_pre_comp[0], tmp); + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + 1 /* mixed */, tmp[0], tmp[1], tmp[2]); + } + + /* do other additions every 5 doublings */ + if (num_points && (i % 5 == 0)) + { + /* loop over all scalars */ + for (num = 0; num < num_points; ++num) + { + bits = get_bit(scalars[num], i + 4) << 5; + bits |= get_bit(scalars[num], i + 3) << 4; + bits |= get_bit(scalars[num], i + 2) << 3; + bits |= get_bit(scalars[num], i + 1) << 2; + bits |= get_bit(scalars[num], i) << 1; + bits |= get_bit(scalars[num], i - 1); + ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits); + + /* select the point to add or subtract, in constant time */ + select_point(digit, 17, pre_comp[num], tmp); + smallfelem_neg(ftmp, tmp[1]); /* (X, -Y, Z) is the negative point */ + copy_small_conditional(ftmp, tmp[1], (((limb) sign) - 1)); + felem_contract(tmp[1], ftmp); + + if (!skip) + { + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + mixed, tmp[0], tmp[1], tmp[2]); + } + else + { + smallfelem_expand(nq[0], tmp[0]); + smallfelem_expand(nq[1], tmp[1]); + smallfelem_expand(nq[2], tmp[2]); + skip = 0; + } + } + } + } + felem_assign(x_out, nq[0]); + felem_assign(y_out, nq[1]); + felem_assign(z_out, nq[2]); + } + +/* Precomputation for the group generator. */ +typedef struct { + smallfelem g_pre_comp[2][16][3]; + int references; +} NISTP256_PRE_COMP; + +const EC_METHOD *EC_GFp_nistp256_method(void) + { + static const EC_METHOD ret = { + EC_FLAGS_DEFAULT_OCT, + NID_X9_62_prime_field, + ec_GFp_nistp256_group_init, + ec_GFp_simple_group_finish, + ec_GFp_simple_group_clear_finish, + ec_GFp_nist_group_copy, + ec_GFp_nistp256_group_set_curve, + ec_GFp_simple_group_get_curve, + ec_GFp_simple_group_get_degree, + ec_GFp_simple_group_check_discriminant, + ec_GFp_simple_point_init, + ec_GFp_simple_point_finish, + ec_GFp_simple_point_clear_finish, + ec_GFp_simple_point_copy, + ec_GFp_simple_point_set_to_infinity, + ec_GFp_simple_set_Jprojective_coordinates_GFp, + ec_GFp_simple_get_Jprojective_coordinates_GFp, + ec_GFp_simple_point_set_affine_coordinates, + ec_GFp_nistp256_point_get_affine_coordinates, + 0 /* point_set_compressed_coordinates */, + 0 /* point2oct */, + 0 /* oct2point */, + ec_GFp_simple_add, + ec_GFp_simple_dbl, + ec_GFp_simple_invert, + ec_GFp_simple_is_at_infinity, + ec_GFp_simple_is_on_curve, + ec_GFp_simple_cmp, + ec_GFp_simple_make_affine, + ec_GFp_simple_points_make_affine, + ec_GFp_nistp256_points_mul, + ec_GFp_nistp256_precompute_mult, + ec_GFp_nistp256_have_precompute_mult, + ec_GFp_nist_field_mul, + ec_GFp_nist_field_sqr, + 0 /* field_div */, + 0 /* field_encode */, + 0 /* field_decode */, + 0 /* field_set_to_one */ }; + + return &ret; + } + +/******************************************************************************/ +/* FUNCTIONS TO MANAGE PRECOMPUTATION + */ + +static NISTP256_PRE_COMP *nistp256_pre_comp_new() + { + NISTP256_PRE_COMP *ret = NULL; + ret = (NISTP256_PRE_COMP *) OPENSSL_malloc(sizeof *ret); + if (!ret) + { + ECerr(EC_F_NISTP256_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); + return ret; + } + memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); + ret->references = 1; + return ret; + } + +static void *nistp256_pre_comp_dup(void *src_) + { + NISTP256_PRE_COMP *src = src_; + + /* no need to actually copy, these objects never change! */ + CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); + + return src_; + } + +static void nistp256_pre_comp_free(void *pre_) + { + int i; + NISTP256_PRE_COMP *pre = pre_; + + if (!pre) + return; + + i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); + if (i > 0) + return; + + OPENSSL_free(pre); + } + +static void nistp256_pre_comp_clear_free(void *pre_) + { + int i; + NISTP256_PRE_COMP *pre = pre_; + + if (!pre) + return; + + i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); + if (i > 0) + return; + + OPENSSL_cleanse(pre, sizeof *pre); + OPENSSL_free(pre); + } + +/******************************************************************************/ +/* OPENSSL EC_METHOD FUNCTIONS + */ + +int ec_GFp_nistp256_group_init(EC_GROUP *group) + { + int ret; + ret = ec_GFp_simple_group_init(group); + group->a_is_minus3 = 1; + return ret; + } + +int ec_GFp_nistp256_group_set_curve(EC_GROUP *group, const BIGNUM *p, + const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) + { + int ret = 0; + BN_CTX *new_ctx = NULL; + BIGNUM *curve_p, *curve_a, *curve_b; + + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((curve_p = BN_CTX_get(ctx)) == NULL) || + ((curve_a = BN_CTX_get(ctx)) == NULL) || + ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err; + BN_bin2bn(nistp256_curve_params[0], sizeof(felem_bytearray), curve_p); + BN_bin2bn(nistp256_curve_params[1], sizeof(felem_bytearray), curve_a); + BN_bin2bn(nistp256_curve_params[2], sizeof(felem_bytearray), curve_b); + if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || + (BN_cmp(curve_b, b))) + { + ECerr(EC_F_EC_GFP_NISTP256_GROUP_SET_CURVE, + EC_R_WRONG_CURVE_PARAMETERS); + goto err; + } + group->field_mod_func = BN_nist_mod_256; + ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx); +err: + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + } + +/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns + * (X', Y') = (X/Z^2, Y/Z^3) */ +int ec_GFp_nistp256_point_get_affine_coordinates(const EC_GROUP *group, + const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx) + { + felem z1, z2, x_in, y_in; + smallfelem x_out, y_out; + longfelem tmp; + + if (EC_POINT_is_at_infinity(group, point)) + { + ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES, + EC_R_POINT_AT_INFINITY); + return 0; + } + if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) || + (!BN_to_felem(z1, &point->Z))) return 0; + felem_inv(z2, z1); + felem_square(tmp, z2); felem_reduce(z1, tmp); + felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp); + felem_contract(x_out, x_in); + if (x != NULL) + { + if (!smallfelem_to_BN(x, x_out)) { + ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES, + ERR_R_BN_LIB); + return 0; + } + } + felem_mul(tmp, z1, z2); felem_reduce(z1, tmp); + felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp); + felem_contract(y_out, y_in); + if (y != NULL) + { + if (!smallfelem_to_BN(y, y_out)) + { + ECerr(EC_F_EC_GFP_NISTP256_POINT_GET_AFFINE_COORDINATES, + ERR_R_BN_LIB); + return 0; + } + } + return 1; + } + +static void make_points_affine(size_t num, smallfelem points[/* num */][3], smallfelem tmp_smallfelems[/* num+1 */]) + { + /* Runs in constant time, unless an input is the point at infinity + * (which normally shouldn't happen). */ + ec_GFp_nistp_points_make_affine_internal( + num, + points, + sizeof(smallfelem), + tmp_smallfelems, + (void (*)(void *)) smallfelem_one, + (int (*)(const void *)) smallfelem_is_zero_int, + (void (*)(void *, const void *)) smallfelem_assign, + (void (*)(void *, const void *)) smallfelem_square_contract, + (void (*)(void *, const void *, const void *)) smallfelem_mul_contract, + (void (*)(void *, const void *)) smallfelem_inv_contract, + (void (*)(void *, const void *)) smallfelem_assign /* nothing to contract */); + } + +/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values + * Result is stored in r (r can equal one of the inputs). */ +int ec_GFp_nistp256_points_mul(const EC_GROUP *group, EC_POINT *r, + const BIGNUM *scalar, size_t num, const EC_POINT *points[], + const BIGNUM *scalars[], BN_CTX *ctx) + { + int ret = 0; + int j; + int mixed = 0; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y, *z, *tmp_scalar; + felem_bytearray g_secret; + felem_bytearray *secrets = NULL; + smallfelem (*pre_comp)[17][3] = NULL; + smallfelem *tmp_smallfelems = NULL; + felem_bytearray tmp; + unsigned i, num_bytes; + int have_pre_comp = 0; + size_t num_points = num; + smallfelem x_in, y_in, z_in; + felem x_out, y_out, z_out; + NISTP256_PRE_COMP *pre = NULL; + const smallfelem (*g_pre_comp)[16][3] = NULL; + EC_POINT *generator = NULL; + const EC_POINT *p = NULL; + const BIGNUM *p_scalar = NULL; + + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((x = BN_CTX_get(ctx)) == NULL) || + ((y = BN_CTX_get(ctx)) == NULL) || + ((z = BN_CTX_get(ctx)) == NULL) || + ((tmp_scalar = BN_CTX_get(ctx)) == NULL)) + goto err; + + if (scalar != NULL) + { + pre = EC_EX_DATA_get_data(group->extra_data, + nistp256_pre_comp_dup, nistp256_pre_comp_free, + nistp256_pre_comp_clear_free); + if (pre) + /* we have precomputation, try to use it */ + g_pre_comp = (const smallfelem (*)[16][3]) pre->g_pre_comp; + else + /* try to use the standard precomputation */ + g_pre_comp = &gmul[0]; + generator = EC_POINT_new(group); + if (generator == NULL) + goto err; + /* get the generator from precomputation */ + if (!smallfelem_to_BN(x, g_pre_comp[0][1][0]) || + !smallfelem_to_BN(y, g_pre_comp[0][1][1]) || + !smallfelem_to_BN(z, g_pre_comp[0][1][2])) + { + ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + if (!EC_POINT_set_Jprojective_coordinates_GFp(group, + generator, x, y, z, ctx)) + goto err; + if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) + /* precomputation matches generator */ + have_pre_comp = 1; + else + /* we don't have valid precomputation: + * treat the generator as a random point */ + num_points++; + } + if (num_points > 0) + { + if (num_points >= 3) + { + /* unless we precompute multiples for just one or two points, + * converting those into affine form is time well spent */ + mixed = 1; + } + secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray)); + pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(smallfelem)); + if (mixed) + tmp_smallfelems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(smallfelem)); + if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_smallfelems == NULL))) + { + ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_MALLOC_FAILURE); + goto err; + } + + /* we treat NULL scalars as 0, and NULL points as points at infinity, + * i.e., they contribute nothing to the linear combination */ + memset(secrets, 0, num_points * sizeof(felem_bytearray)); + memset(pre_comp, 0, num_points * 17 * 3 * sizeof(smallfelem)); + for (i = 0; i < num_points; ++i) + { + if (i == num) + /* we didn't have a valid precomputation, so we pick + * the generator */ + { + p = EC_GROUP_get0_generator(group); + p_scalar = scalar; + } + else + /* the i^th point */ + { + p = points[i]; + p_scalar = scalars[i]; + } + if ((p_scalar != NULL) && (p != NULL)) + { + /* reduce scalar to 0 <= scalar < 2^256 */ + if ((BN_num_bits(p_scalar) > 256) || (BN_is_negative(p_scalar))) + { + /* this is an unusual input, and we don't guarantee + * constant-timeness */ + if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) + { + ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + num_bytes = BN_bn2bin(tmp_scalar, tmp); + } + else + num_bytes = BN_bn2bin(p_scalar, tmp); + flip_endian(secrets[i], tmp, num_bytes); + /* precompute multiples */ + if ((!BN_to_felem(x_out, &p->X)) || + (!BN_to_felem(y_out, &p->Y)) || + (!BN_to_felem(z_out, &p->Z))) goto err; + felem_shrink(pre_comp[i][1][0], x_out); + felem_shrink(pre_comp[i][1][1], y_out); + felem_shrink(pre_comp[i][1][2], z_out); + for (j = 2; j <= 16; ++j) + { + if (j & 1) + { + point_add_small( + pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], + pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], + pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]); + } + else + { + point_double_small( + pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], + pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]); + } + } + } + } + if (mixed) + make_points_affine(num_points * 17, pre_comp[0], tmp_smallfelems); + } + + /* the scalar for the generator */ + if ((scalar != NULL) && (have_pre_comp)) + { + memset(g_secret, 0, sizeof(g_secret)); + /* reduce scalar to 0 <= scalar < 2^256 */ + if ((BN_num_bits(scalar) > 256) || (BN_is_negative(scalar))) + { + /* this is an unusual input, and we don't guarantee + * constant-timeness */ + if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) + { + ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + num_bytes = BN_bn2bin(tmp_scalar, tmp); + } + else + num_bytes = BN_bn2bin(scalar, tmp); + flip_endian(g_secret, tmp, num_bytes); + /* do the multiplication with generator precomputation*/ + batch_mul(x_out, y_out, z_out, + (const felem_bytearray (*)) secrets, num_points, + g_secret, + mixed, (const smallfelem (*)[17][3]) pre_comp, + g_pre_comp); + } + else + /* do the multiplication without generator precomputation */ + batch_mul(x_out, y_out, z_out, + (const felem_bytearray (*)) secrets, num_points, + NULL, mixed, (const smallfelem (*)[17][3]) pre_comp, NULL); + /* reduce the output to its unique minimal representation */ + felem_contract(x_in, x_out); + felem_contract(y_in, y_out); + felem_contract(z_in, z_out); + if ((!smallfelem_to_BN(x, x_in)) || (!smallfelem_to_BN(y, y_in)) || + (!smallfelem_to_BN(z, z_in))) + { + ECerr(EC_F_EC_GFP_NISTP256_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx); + +err: + BN_CTX_end(ctx); + if (generator != NULL) + EC_POINT_free(generator); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + if (secrets != NULL) + OPENSSL_free(secrets); + if (pre_comp != NULL) + OPENSSL_free(pre_comp); + if (tmp_smallfelems != NULL) + OPENSSL_free(tmp_smallfelems); + return ret; + } + +int ec_GFp_nistp256_precompute_mult(EC_GROUP *group, BN_CTX *ctx) + { + int ret = 0; + NISTP256_PRE_COMP *pre = NULL; + int i, j; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y; + EC_POINT *generator = NULL; + smallfelem tmp_smallfelems[32]; + felem x_tmp, y_tmp, z_tmp; + + /* throw away old precomputation */ + EC_EX_DATA_free_data(&group->extra_data, nistp256_pre_comp_dup, + nistp256_pre_comp_free, nistp256_pre_comp_clear_free); + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((x = BN_CTX_get(ctx)) == NULL) || + ((y = BN_CTX_get(ctx)) == NULL)) + goto err; + /* get the generator */ + if (group->generator == NULL) goto err; + generator = EC_POINT_new(group); + if (generator == NULL) + goto err; + BN_bin2bn(nistp256_curve_params[3], sizeof (felem_bytearray), x); + BN_bin2bn(nistp256_curve_params[4], sizeof (felem_bytearray), y); + if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx)) + goto err; + if ((pre = nistp256_pre_comp_new()) == NULL) + goto err; + /* if the generator is the standard one, use built-in precomputation */ + if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) + { + memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); + ret = 1; + goto err; + } + if ((!BN_to_felem(x_tmp, &group->generator->X)) || + (!BN_to_felem(y_tmp, &group->generator->Y)) || + (!BN_to_felem(z_tmp, &group->generator->Z))) + goto err; + felem_shrink(pre->g_pre_comp[0][1][0], x_tmp); + felem_shrink(pre->g_pre_comp[0][1][1], y_tmp); + felem_shrink(pre->g_pre_comp[0][1][2], z_tmp); + /* compute 2^64*G, 2^128*G, 2^192*G for the first table, + * 2^32*G, 2^96*G, 2^160*G, 2^224*G for the second one + */ + for (i = 1; i <= 8; i <<= 1) + { + point_double_small( + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2], + pre->g_pre_comp[0][i][0], pre->g_pre_comp[0][i][1], pre->g_pre_comp[0][i][2]); + for (j = 0; j < 31; ++j) + { + point_double_small( + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2], + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]); + } + if (i == 8) + break; + point_double_small( + pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2], + pre->g_pre_comp[1][i][0], pre->g_pre_comp[1][i][1], pre->g_pre_comp[1][i][2]); + for (j = 0; j < 31; ++j) + { + point_double_small( + pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2], + pre->g_pre_comp[0][2*i][0], pre->g_pre_comp[0][2*i][1], pre->g_pre_comp[0][2*i][2]); + } + } + for (i = 0; i < 2; i++) + { + /* g_pre_comp[i][0] is the point at infinity */ + memset(pre->g_pre_comp[i][0], 0, sizeof(pre->g_pre_comp[i][0])); + /* the remaining multiples */ + /* 2^64*G + 2^128*G resp. 2^96*G + 2^160*G */ + point_add_small( + pre->g_pre_comp[i][6][0], pre->g_pre_comp[i][6][1], pre->g_pre_comp[i][6][2], + pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2], + pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]); + /* 2^64*G + 2^192*G resp. 2^96*G + 2^224*G */ + point_add_small( + pre->g_pre_comp[i][10][0], pre->g_pre_comp[i][10][1], pre->g_pre_comp[i][10][2], + pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2], + pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]); + /* 2^128*G + 2^192*G resp. 2^160*G + 2^224*G */ + point_add_small( + pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2], + pre->g_pre_comp[i][8][0], pre->g_pre_comp[i][8][1], pre->g_pre_comp[i][8][2], + pre->g_pre_comp[i][4][0], pre->g_pre_comp[i][4][1], pre->g_pre_comp[i][4][2]); + /* 2^64*G + 2^128*G + 2^192*G resp. 2^96*G + 2^160*G + 2^224*G */ + point_add_small( + pre->g_pre_comp[i][14][0], pre->g_pre_comp[i][14][1], pre->g_pre_comp[i][14][2], + pre->g_pre_comp[i][12][0], pre->g_pre_comp[i][12][1], pre->g_pre_comp[i][12][2], + pre->g_pre_comp[i][2][0], pre->g_pre_comp[i][2][1], pre->g_pre_comp[i][2][2]); + for (j = 1; j < 8; ++j) + { + /* odd multiples: add G resp. 2^32*G */ + point_add_small( + pre->g_pre_comp[i][2*j+1][0], pre->g_pre_comp[i][2*j+1][1], pre->g_pre_comp[i][2*j+1][2], + pre->g_pre_comp[i][2*j][0], pre->g_pre_comp[i][2*j][1], pre->g_pre_comp[i][2*j][2], + pre->g_pre_comp[i][1][0], pre->g_pre_comp[i][1][1], pre->g_pre_comp[i][1][2]); + } + } + make_points_affine(31, &(pre->g_pre_comp[0][1]), tmp_smallfelems); + + if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp256_pre_comp_dup, + nistp256_pre_comp_free, nistp256_pre_comp_clear_free)) + goto err; + ret = 1; + pre = NULL; + err: + BN_CTX_end(ctx); + if (generator != NULL) + EC_POINT_free(generator); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + if (pre) + nistp256_pre_comp_free(pre); + return ret; + } + +int ec_GFp_nistp256_have_precompute_mult(const EC_GROUP *group) + { + if (EC_EX_DATA_get_data(group->extra_data, nistp256_pre_comp_dup, + nistp256_pre_comp_free, nistp256_pre_comp_clear_free) + != NULL) + return 1; + else + return 0; + } +#else +static void *dummy=&dummy; +#endif diff --git a/src/lib/libcrypto/ec/ecp_nistp521.c b/src/lib/libcrypto/ec/ecp_nistp521.c new file mode 100644 index 0000000000..178b655f7f --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistp521.c @@ -0,0 +1,2025 @@ +/* crypto/ec/ecp_nistp521.c */ +/* + * Written by Adam Langley (Google) for the OpenSSL project + */ +/* Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication + * + * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c. + * Otherwise based on Emilia's P224 work, which was inspired by my curve25519 + * work which got its smarts from Daniel J. Bernstein's work on the same. + */ + +#include +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 + +#ifndef OPENSSL_SYS_VMS +#include +#else +#include +#endif + +#include +#include +#include "ec_lcl.h" + +#if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1)) + /* even with gcc, the typedef won't work for 32-bit platforms */ + typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */ +#else + #error "Need GCC 3.1 or later to define type uint128_t" +#endif + +typedef uint8_t u8; +typedef uint64_t u64; +typedef int64_t s64; + +/* The underlying field. + * + * P521 operates over GF(2^521-1). We can serialise an element of this field + * into 66 bytes where the most significant byte contains only a single bit. We + * call this an felem_bytearray. */ + +typedef u8 felem_bytearray[66]; + +/* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5. + * These values are big-endian. */ +static const felem_bytearray nistp521_curve_params[5] = + { + {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff}, + {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */ + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xfc}, + {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */ + 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85, + 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3, + 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1, + 0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e, + 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1, + 0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c, + 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50, + 0x3f, 0x00}, + {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */ + 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95, + 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f, + 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d, + 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7, + 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff, + 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a, + 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5, + 0xbd, 0x66}, + {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */ + 0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d, + 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b, + 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e, + 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4, + 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad, + 0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72, + 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1, + 0x66, 0x50} + }; + +/* The representation of field elements. + * ------------------------------------ + * + * We represent field elements with nine values. These values are either 64 or + * 128 bits and the field element represented is: + * v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464 (mod p) + * Each of the nine values is called a 'limb'. Since the limbs are spaced only + * 58 bits apart, but are greater than 58 bits in length, the most significant + * bits of each limb overlap with the least significant bits of the next. + * + * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a + * 'largefelem' */ + +#define NLIMBS 9 + +typedef uint64_t limb; +typedef limb felem[NLIMBS]; +typedef uint128_t largefelem[NLIMBS]; + +static const limb bottom57bits = 0x1ffffffffffffff; +static const limb bottom58bits = 0x3ffffffffffffff; + +/* bin66_to_felem takes a little-endian byte array and converts it into felem + * form. This assumes that the CPU is little-endian. */ +static void bin66_to_felem(felem out, const u8 in[66]) + { + out[0] = (*((limb*) &in[0])) & bottom58bits; + out[1] = (*((limb*) &in[7]) >> 2) & bottom58bits; + out[2] = (*((limb*) &in[14]) >> 4) & bottom58bits; + out[3] = (*((limb*) &in[21]) >> 6) & bottom58bits; + out[4] = (*((limb*) &in[29])) & bottom58bits; + out[5] = (*((limb*) &in[36]) >> 2) & bottom58bits; + out[6] = (*((limb*) &in[43]) >> 4) & bottom58bits; + out[7] = (*((limb*) &in[50]) >> 6) & bottom58bits; + out[8] = (*((limb*) &in[58])) & bottom57bits; + } + +/* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte + * array. This assumes that the CPU is little-endian. */ +static void felem_to_bin66(u8 out[66], const felem in) + { + memset(out, 0, 66); + (*((limb*) &out[0])) = in[0]; + (*((limb*) &out[7])) |= in[1] << 2; + (*((limb*) &out[14])) |= in[2] << 4; + (*((limb*) &out[21])) |= in[3] << 6; + (*((limb*) &out[29])) = in[4]; + (*((limb*) &out[36])) |= in[5] << 2; + (*((limb*) &out[43])) |= in[6] << 4; + (*((limb*) &out[50])) |= in[7] << 6; + (*((limb*) &out[58])) = in[8]; + } + +/* To preserve endianness when using BN_bn2bin and BN_bin2bn */ +static void flip_endian(u8 *out, const u8 *in, unsigned len) + { + unsigned i; + for (i = 0; i < len; ++i) + out[i] = in[len-1-i]; + } + +/* BN_to_felem converts an OpenSSL BIGNUM into an felem */ +static int BN_to_felem(felem out, const BIGNUM *bn) + { + felem_bytearray b_in; + felem_bytearray b_out; + unsigned num_bytes; + + /* BN_bn2bin eats leading zeroes */ + memset(b_out, 0, sizeof b_out); + num_bytes = BN_num_bytes(bn); + if (num_bytes > sizeof b_out) + { + ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); + return 0; + } + if (BN_is_negative(bn)) + { + ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE); + return 0; + } + num_bytes = BN_bn2bin(bn, b_in); + flip_endian(b_out, b_in, num_bytes); + bin66_to_felem(out, b_out); + return 1; + } + +/* felem_to_BN converts an felem into an OpenSSL BIGNUM */ +static BIGNUM *felem_to_BN(BIGNUM *out, const felem in) + { + felem_bytearray b_in, b_out; + felem_to_bin66(b_in, in); + flip_endian(b_out, b_in, sizeof b_out); + return BN_bin2bn(b_out, sizeof b_out, out); + } + + +/* Field operations + * ---------------- */ + +static void felem_one(felem out) + { + out[0] = 1; + out[1] = 0; + out[2] = 0; + out[3] = 0; + out[4] = 0; + out[5] = 0; + out[6] = 0; + out[7] = 0; + out[8] = 0; + } + +static void felem_assign(felem out, const felem in) + { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; + out[6] = in[6]; + out[7] = in[7]; + out[8] = in[8]; + } + +/* felem_sum64 sets out = out + in. */ +static void felem_sum64(felem out, const felem in) + { + out[0] += in[0]; + out[1] += in[1]; + out[2] += in[2]; + out[3] += in[3]; + out[4] += in[4]; + out[5] += in[5]; + out[6] += in[6]; + out[7] += in[7]; + out[8] += in[8]; + } + +/* felem_scalar sets out = in * scalar */ +static void felem_scalar(felem out, const felem in, limb scalar) + { + out[0] = in[0] * scalar; + out[1] = in[1] * scalar; + out[2] = in[2] * scalar; + out[3] = in[3] * scalar; + out[4] = in[4] * scalar; + out[5] = in[5] * scalar; + out[6] = in[6] * scalar; + out[7] = in[7] * scalar; + out[8] = in[8] * scalar; + } + +/* felem_scalar64 sets out = out * scalar */ +static void felem_scalar64(felem out, limb scalar) + { + out[0] *= scalar; + out[1] *= scalar; + out[2] *= scalar; + out[3] *= scalar; + out[4] *= scalar; + out[5] *= scalar; + out[6] *= scalar; + out[7] *= scalar; + out[8] *= scalar; + } + +/* felem_scalar128 sets out = out * scalar */ +static void felem_scalar128(largefelem out, limb scalar) + { + out[0] *= scalar; + out[1] *= scalar; + out[2] *= scalar; + out[3] *= scalar; + out[4] *= scalar; + out[5] *= scalar; + out[6] *= scalar; + out[7] *= scalar; + out[8] *= scalar; + } + +/* felem_neg sets |out| to |-in| + * On entry: + * in[i] < 2^59 + 2^14 + * On exit: + * out[i] < 2^62 + */ +static void felem_neg(felem out, const felem in) + { + /* In order to prevent underflow, we subtract from 0 mod p. */ + static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5); + static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4); + + out[0] = two62m3 - in[0]; + out[1] = two62m2 - in[1]; + out[2] = two62m2 - in[2]; + out[3] = two62m2 - in[3]; + out[4] = two62m2 - in[4]; + out[5] = two62m2 - in[5]; + out[6] = two62m2 - in[6]; + out[7] = two62m2 - in[7]; + out[8] = two62m2 - in[8]; + } + +/* felem_diff64 subtracts |in| from |out| + * On entry: + * in[i] < 2^59 + 2^14 + * On exit: + * out[i] < out[i] + 2^62 + */ +static void felem_diff64(felem out, const felem in) + { + /* In order to prevent underflow, we add 0 mod p before subtracting. */ + static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5); + static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4); + + out[0] += two62m3 - in[0]; + out[1] += two62m2 - in[1]; + out[2] += two62m2 - in[2]; + out[3] += two62m2 - in[3]; + out[4] += two62m2 - in[4]; + out[5] += two62m2 - in[5]; + out[6] += two62m2 - in[6]; + out[7] += two62m2 - in[7]; + out[8] += two62m2 - in[8]; + } + +/* felem_diff_128_64 subtracts |in| from |out| + * On entry: + * in[i] < 2^62 + 2^17 + * On exit: + * out[i] < out[i] + 2^63 + */ +static void felem_diff_128_64(largefelem out, const felem in) + { + /* In order to prevent underflow, we add 0 mod p before subtracting. */ + static const limb two63m6 = (((limb)1) << 62) - (((limb)1) << 5); + static const limb two63m5 = (((limb)1) << 62) - (((limb)1) << 4); + + out[0] += two63m6 - in[0]; + out[1] += two63m5 - in[1]; + out[2] += two63m5 - in[2]; + out[3] += two63m5 - in[3]; + out[4] += two63m5 - in[4]; + out[5] += two63m5 - in[5]; + out[6] += two63m5 - in[6]; + out[7] += two63m5 - in[7]; + out[8] += two63m5 - in[8]; + } + +/* felem_diff_128_64 subtracts |in| from |out| + * On entry: + * in[i] < 2^126 + * On exit: + * out[i] < out[i] + 2^127 - 2^69 + */ +static void felem_diff128(largefelem out, const largefelem in) + { + /* In order to prevent underflow, we add 0 mod p before subtracting. */ + static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70); + static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69); + + out[0] += (two127m70 - in[0]); + out[1] += (two127m69 - in[1]); + out[2] += (two127m69 - in[2]); + out[3] += (two127m69 - in[3]); + out[4] += (two127m69 - in[4]); + out[5] += (two127m69 - in[5]); + out[6] += (two127m69 - in[6]); + out[7] += (two127m69 - in[7]); + out[8] += (two127m69 - in[8]); + } + +/* felem_square sets |out| = |in|^2 + * On entry: + * in[i] < 2^62 + * On exit: + * out[i] < 17 * max(in[i]) * max(in[i]) + */ +static void felem_square(largefelem out, const felem in) + { + felem inx2, inx4; + felem_scalar(inx2, in, 2); + felem_scalar(inx4, in, 4); + + /* We have many cases were we want to do + * in[x] * in[y] + + * in[y] * in[x] + * This is obviously just + * 2 * in[x] * in[y] + * However, rather than do the doubling on the 128 bit result, we + * double one of the inputs to the multiplication by reading from + * |inx2| */ + + out[0] = ((uint128_t) in[0]) * in[0]; + out[1] = ((uint128_t) in[0]) * inx2[1]; + out[2] = ((uint128_t) in[0]) * inx2[2] + + ((uint128_t) in[1]) * in[1]; + out[3] = ((uint128_t) in[0]) * inx2[3] + + ((uint128_t) in[1]) * inx2[2]; + out[4] = ((uint128_t) in[0]) * inx2[4] + + ((uint128_t) in[1]) * inx2[3] + + ((uint128_t) in[2]) * in[2]; + out[5] = ((uint128_t) in[0]) * inx2[5] + + ((uint128_t) in[1]) * inx2[4] + + ((uint128_t) in[2]) * inx2[3]; + out[6] = ((uint128_t) in[0]) * inx2[6] + + ((uint128_t) in[1]) * inx2[5] + + ((uint128_t) in[2]) * inx2[4] + + ((uint128_t) in[3]) * in[3]; + out[7] = ((uint128_t) in[0]) * inx2[7] + + ((uint128_t) in[1]) * inx2[6] + + ((uint128_t) in[2]) * inx2[5] + + ((uint128_t) in[3]) * inx2[4]; + out[8] = ((uint128_t) in[0]) * inx2[8] + + ((uint128_t) in[1]) * inx2[7] + + ((uint128_t) in[2]) * inx2[6] + + ((uint128_t) in[3]) * inx2[5] + + ((uint128_t) in[4]) * in[4]; + + /* The remaining limbs fall above 2^521, with the first falling at + * 2^522. They correspond to locations one bit up from the limbs + * produced above so we would have to multiply by two to align them. + * Again, rather than operate on the 128-bit result, we double one of + * the inputs to the multiplication. If we want to double for both this + * reason, and the reason above, then we end up multiplying by four. */ + + /* 9 */ + out[0] += ((uint128_t) in[1]) * inx4[8] + + ((uint128_t) in[2]) * inx4[7] + + ((uint128_t) in[3]) * inx4[6] + + ((uint128_t) in[4]) * inx4[5]; + + /* 10 */ + out[1] += ((uint128_t) in[2]) * inx4[8] + + ((uint128_t) in[3]) * inx4[7] + + ((uint128_t) in[4]) * inx4[6] + + ((uint128_t) in[5]) * inx2[5]; + + /* 11 */ + out[2] += ((uint128_t) in[3]) * inx4[8] + + ((uint128_t) in[4]) * inx4[7] + + ((uint128_t) in[5]) * inx4[6]; + + /* 12 */ + out[3] += ((uint128_t) in[4]) * inx4[8] + + ((uint128_t) in[5]) * inx4[7] + + ((uint128_t) in[6]) * inx2[6]; + + /* 13 */ + out[4] += ((uint128_t) in[5]) * inx4[8] + + ((uint128_t) in[6]) * inx4[7]; + + /* 14 */ + out[5] += ((uint128_t) in[6]) * inx4[8] + + ((uint128_t) in[7]) * inx2[7]; + + /* 15 */ + out[6] += ((uint128_t) in[7]) * inx4[8]; + + /* 16 */ + out[7] += ((uint128_t) in[8]) * inx2[8]; + } + +/* felem_mul sets |out| = |in1| * |in2| + * On entry: + * in1[i] < 2^64 + * in2[i] < 2^63 + * On exit: + * out[i] < 17 * max(in1[i]) * max(in2[i]) + */ +static void felem_mul(largefelem out, const felem in1, const felem in2) + { + felem in2x2; + felem_scalar(in2x2, in2, 2); + + out[0] = ((uint128_t) in1[0]) * in2[0]; + + out[1] = ((uint128_t) in1[0]) * in2[1] + + ((uint128_t) in1[1]) * in2[0]; + + out[2] = ((uint128_t) in1[0]) * in2[2] + + ((uint128_t) in1[1]) * in2[1] + + ((uint128_t) in1[2]) * in2[0]; + + out[3] = ((uint128_t) in1[0]) * in2[3] + + ((uint128_t) in1[1]) * in2[2] + + ((uint128_t) in1[2]) * in2[1] + + ((uint128_t) in1[3]) * in2[0]; + + out[4] = ((uint128_t) in1[0]) * in2[4] + + ((uint128_t) in1[1]) * in2[3] + + ((uint128_t) in1[2]) * in2[2] + + ((uint128_t) in1[3]) * in2[1] + + ((uint128_t) in1[4]) * in2[0]; + + out[5] = ((uint128_t) in1[0]) * in2[5] + + ((uint128_t) in1[1]) * in2[4] + + ((uint128_t) in1[2]) * in2[3] + + ((uint128_t) in1[3]) * in2[2] + + ((uint128_t) in1[4]) * in2[1] + + ((uint128_t) in1[5]) * in2[0]; + + out[6] = ((uint128_t) in1[0]) * in2[6] + + ((uint128_t) in1[1]) * in2[5] + + ((uint128_t) in1[2]) * in2[4] + + ((uint128_t) in1[3]) * in2[3] + + ((uint128_t) in1[4]) * in2[2] + + ((uint128_t) in1[5]) * in2[1] + + ((uint128_t) in1[6]) * in2[0]; + + out[7] = ((uint128_t) in1[0]) * in2[7] + + ((uint128_t) in1[1]) * in2[6] + + ((uint128_t) in1[2]) * in2[5] + + ((uint128_t) in1[3]) * in2[4] + + ((uint128_t) in1[4]) * in2[3] + + ((uint128_t) in1[5]) * in2[2] + + ((uint128_t) in1[6]) * in2[1] + + ((uint128_t) in1[7]) * in2[0]; + + out[8] = ((uint128_t) in1[0]) * in2[8] + + ((uint128_t) in1[1]) * in2[7] + + ((uint128_t) in1[2]) * in2[6] + + ((uint128_t) in1[3]) * in2[5] + + ((uint128_t) in1[4]) * in2[4] + + ((uint128_t) in1[5]) * in2[3] + + ((uint128_t) in1[6]) * in2[2] + + ((uint128_t) in1[7]) * in2[1] + + ((uint128_t) in1[8]) * in2[0]; + + /* See comment in felem_square about the use of in2x2 here */ + + out[0] += ((uint128_t) in1[1]) * in2x2[8] + + ((uint128_t) in1[2]) * in2x2[7] + + ((uint128_t) in1[3]) * in2x2[6] + + ((uint128_t) in1[4]) * in2x2[5] + + ((uint128_t) in1[5]) * in2x2[4] + + ((uint128_t) in1[6]) * in2x2[3] + + ((uint128_t) in1[7]) * in2x2[2] + + ((uint128_t) in1[8]) * in2x2[1]; + + out[1] += ((uint128_t) in1[2]) * in2x2[8] + + ((uint128_t) in1[3]) * in2x2[7] + + ((uint128_t) in1[4]) * in2x2[6] + + ((uint128_t) in1[5]) * in2x2[5] + + ((uint128_t) in1[6]) * in2x2[4] + + ((uint128_t) in1[7]) * in2x2[3] + + ((uint128_t) in1[8]) * in2x2[2]; + + out[2] += ((uint128_t) in1[3]) * in2x2[8] + + ((uint128_t) in1[4]) * in2x2[7] + + ((uint128_t) in1[5]) * in2x2[6] + + ((uint128_t) in1[6]) * in2x2[5] + + ((uint128_t) in1[7]) * in2x2[4] + + ((uint128_t) in1[8]) * in2x2[3]; + + out[3] += ((uint128_t) in1[4]) * in2x2[8] + + ((uint128_t) in1[5]) * in2x2[7] + + ((uint128_t) in1[6]) * in2x2[6] + + ((uint128_t) in1[7]) * in2x2[5] + + ((uint128_t) in1[8]) * in2x2[4]; + + out[4] += ((uint128_t) in1[5]) * in2x2[8] + + ((uint128_t) in1[6]) * in2x2[7] + + ((uint128_t) in1[7]) * in2x2[6] + + ((uint128_t) in1[8]) * in2x2[5]; + + out[5] += ((uint128_t) in1[6]) * in2x2[8] + + ((uint128_t) in1[7]) * in2x2[7] + + ((uint128_t) in1[8]) * in2x2[6]; + + out[6] += ((uint128_t) in1[7]) * in2x2[8] + + ((uint128_t) in1[8]) * in2x2[7]; + + out[7] += ((uint128_t) in1[8]) * in2x2[8]; + } + +static const limb bottom52bits = 0xfffffffffffff; + +/* felem_reduce converts a largefelem to an felem. + * On entry: + * in[i] < 2^128 + * On exit: + * out[i] < 2^59 + 2^14 + */ +static void felem_reduce(felem out, const largefelem in) + { + u64 overflow1, overflow2; + + out[0] = ((limb) in[0]) & bottom58bits; + out[1] = ((limb) in[1]) & bottom58bits; + out[2] = ((limb) in[2]) & bottom58bits; + out[3] = ((limb) in[3]) & bottom58bits; + out[4] = ((limb) in[4]) & bottom58bits; + out[5] = ((limb) in[5]) & bottom58bits; + out[6] = ((limb) in[6]) & bottom58bits; + out[7] = ((limb) in[7]) & bottom58bits; + out[8] = ((limb) in[8]) & bottom58bits; + + /* out[i] < 2^58 */ + + out[1] += ((limb) in[0]) >> 58; + out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6; + /* out[1] < 2^58 + 2^6 + 2^58 + * = 2^59 + 2^6 */ + out[2] += ((limb) (in[0] >> 64)) >> 52; + + out[2] += ((limb) in[1]) >> 58; + out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6; + out[3] += ((limb) (in[1] >> 64)) >> 52; + + out[3] += ((limb) in[2]) >> 58; + out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6; + out[4] += ((limb) (in[2] >> 64)) >> 52; + + out[4] += ((limb) in[3]) >> 58; + out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6; + out[5] += ((limb) (in[3] >> 64)) >> 52; + + out[5] += ((limb) in[4]) >> 58; + out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6; + out[6] += ((limb) (in[4] >> 64)) >> 52; + + out[6] += ((limb) in[5]) >> 58; + out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6; + out[7] += ((limb) (in[5] >> 64)) >> 52; + + out[7] += ((limb) in[6]) >> 58; + out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6; + out[8] += ((limb) (in[6] >> 64)) >> 52; + + out[8] += ((limb) in[7]) >> 58; + out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6; + /* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12 + * < 2^59 + 2^13 */ + overflow1 = ((limb) (in[7] >> 64)) >> 52; + + overflow1 += ((limb) in[8]) >> 58; + overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6; + overflow2 = ((limb) (in[8] >> 64)) >> 52; + + overflow1 <<= 1; /* overflow1 < 2^13 + 2^7 + 2^59 */ + overflow2 <<= 1; /* overflow2 < 2^13 */ + + out[0] += overflow1; /* out[0] < 2^60 */ + out[1] += overflow2; /* out[1] < 2^59 + 2^6 + 2^13 */ + + out[1] += out[0] >> 58; out[0] &= bottom58bits; + /* out[0] < 2^58 + * out[1] < 2^59 + 2^6 + 2^13 + 2^2 + * < 2^59 + 2^14 */ + } + +static void felem_square_reduce(felem out, const felem in) + { + largefelem tmp; + felem_square(tmp, in); + felem_reduce(out, tmp); + } + +static void felem_mul_reduce(felem out, const felem in1, const felem in2) + { + largefelem tmp; + felem_mul(tmp, in1, in2); + felem_reduce(out, tmp); + } + +/* felem_inv calculates |out| = |in|^{-1} + * + * Based on Fermat's Little Theorem: + * a^p = a (mod p) + * a^{p-1} = 1 (mod p) + * a^{p-2} = a^{-1} (mod p) + */ +static void felem_inv(felem out, const felem in) + { + felem ftmp, ftmp2, ftmp3, ftmp4; + largefelem tmp; + unsigned i; + + felem_square(tmp, in); felem_reduce(ftmp, tmp); /* 2^1 */ + felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp); /* 2^2 - 2^0 */ + felem_assign(ftmp2, ftmp); + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^3 - 2^1 */ + felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp); /* 2^3 - 2^0 */ + felem_square(tmp, ftmp); felem_reduce(ftmp, tmp); /* 2^4 - 2^1 */ + + felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp); /* 2^3 - 2^1 */ + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^4 - 2^2 */ + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^4 - 2^0 */ + + felem_assign(ftmp2, ftmp3); + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^5 - 2^1 */ + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^6 - 2^2 */ + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^7 - 2^3 */ + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^8 - 2^4 */ + felem_assign(ftmp4, ftmp3); + felem_mul(tmp, ftmp3, ftmp); felem_reduce(ftmp4, tmp); /* 2^8 - 2^1 */ + felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp); /* 2^9 - 2^2 */ + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^8 - 2^0 */ + felem_assign(ftmp2, ftmp3); + + for (i = 0; i < 8; i++) + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */ + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^16 - 2^0 */ + felem_assign(ftmp2, ftmp3); + + for (i = 0; i < 16; i++) + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */ + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^32 - 2^0 */ + felem_assign(ftmp2, ftmp3); + + for (i = 0; i < 32; i++) + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */ + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^64 - 2^0 */ + felem_assign(ftmp2, ftmp3); + + for (i = 0; i < 64; i++) + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */ + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^128 - 2^0 */ + felem_assign(ftmp2, ftmp3); + + for (i = 0; i < 128; i++) + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */ + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^256 - 2^0 */ + felem_assign(ftmp2, ftmp3); + + for (i = 0; i < 256; i++) + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */ + } + felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp); /* 2^512 - 2^0 */ + + for (i = 0; i < 9; i++) + { + felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */ + } + felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp); /* 2^512 - 2^2 */ + felem_mul(tmp, ftmp3, in); felem_reduce(out, tmp); /* 2^512 - 3 */ +} + +/* This is 2^521-1, expressed as an felem */ +static const felem kPrime = + { + 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff, + 0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff, + 0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff + }; + +/* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0 + * otherwise. + * On entry: + * in[i] < 2^59 + 2^14 + */ +static limb felem_is_zero(const felem in) + { + felem ftmp; + limb is_zero, is_p; + felem_assign(ftmp, in); + + ftmp[0] += ftmp[8] >> 57; ftmp[8] &= bottom57bits; + /* ftmp[8] < 2^57 */ + ftmp[1] += ftmp[0] >> 58; ftmp[0] &= bottom58bits; + ftmp[2] += ftmp[1] >> 58; ftmp[1] &= bottom58bits; + ftmp[3] += ftmp[2] >> 58; ftmp[2] &= bottom58bits; + ftmp[4] += ftmp[3] >> 58; ftmp[3] &= bottom58bits; + ftmp[5] += ftmp[4] >> 58; ftmp[4] &= bottom58bits; + ftmp[6] += ftmp[5] >> 58; ftmp[5] &= bottom58bits; + ftmp[7] += ftmp[6] >> 58; ftmp[6] &= bottom58bits; + ftmp[8] += ftmp[7] >> 58; ftmp[7] &= bottom58bits; + /* ftmp[8] < 2^57 + 4 */ + + /* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is + * greater than our bound for ftmp[8]. Therefore we only have to check + * if the zero is zero or 2^521-1. */ + + is_zero = 0; + is_zero |= ftmp[0]; + is_zero |= ftmp[1]; + is_zero |= ftmp[2]; + is_zero |= ftmp[3]; + is_zero |= ftmp[4]; + is_zero |= ftmp[5]; + is_zero |= ftmp[6]; + is_zero |= ftmp[7]; + is_zero |= ftmp[8]; + + is_zero--; + /* We know that ftmp[i] < 2^63, therefore the only way that the top bit + * can be set is if is_zero was 0 before the decrement. */ + is_zero = ((s64) is_zero) >> 63; + + is_p = ftmp[0] ^ kPrime[0]; + is_p |= ftmp[1] ^ kPrime[1]; + is_p |= ftmp[2] ^ kPrime[2]; + is_p |= ftmp[3] ^ kPrime[3]; + is_p |= ftmp[4] ^ kPrime[4]; + is_p |= ftmp[5] ^ kPrime[5]; + is_p |= ftmp[6] ^ kPrime[6]; + is_p |= ftmp[7] ^ kPrime[7]; + is_p |= ftmp[8] ^ kPrime[8]; + + is_p--; + is_p = ((s64) is_p) >> 63; + + is_zero |= is_p; + return is_zero; + } + +static int felem_is_zero_int(const felem in) + { + return (int) (felem_is_zero(in) & ((limb)1)); + } + +/* felem_contract converts |in| to its unique, minimal representation. + * On entry: + * in[i] < 2^59 + 2^14 + */ +static void felem_contract(felem out, const felem in) + { + limb is_p, is_greater, sign; + static const limb two58 = ((limb)1) << 58; + + felem_assign(out, in); + + out[0] += out[8] >> 57; out[8] &= bottom57bits; + /* out[8] < 2^57 */ + out[1] += out[0] >> 58; out[0] &= bottom58bits; + out[2] += out[1] >> 58; out[1] &= bottom58bits; + out[3] += out[2] >> 58; out[2] &= bottom58bits; + out[4] += out[3] >> 58; out[3] &= bottom58bits; + out[5] += out[4] >> 58; out[4] &= bottom58bits; + out[6] += out[5] >> 58; out[5] &= bottom58bits; + out[7] += out[6] >> 58; out[6] &= bottom58bits; + out[8] += out[7] >> 58; out[7] &= bottom58bits; + /* out[8] < 2^57 + 4 */ + + /* If the value is greater than 2^521-1 then we have to subtract + * 2^521-1 out. See the comments in felem_is_zero regarding why we + * don't test for other multiples of the prime. */ + + /* First, if |out| is equal to 2^521-1, we subtract it out to get zero. */ + + is_p = out[0] ^ kPrime[0]; + is_p |= out[1] ^ kPrime[1]; + is_p |= out[2] ^ kPrime[2]; + is_p |= out[3] ^ kPrime[3]; + is_p |= out[4] ^ kPrime[4]; + is_p |= out[5] ^ kPrime[5]; + is_p |= out[6] ^ kPrime[6]; + is_p |= out[7] ^ kPrime[7]; + is_p |= out[8] ^ kPrime[8]; + + is_p--; + is_p &= is_p << 32; + is_p &= is_p << 16; + is_p &= is_p << 8; + is_p &= is_p << 4; + is_p &= is_p << 2; + is_p &= is_p << 1; + is_p = ((s64) is_p) >> 63; + is_p = ~is_p; + + /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */ + + out[0] &= is_p; + out[1] &= is_p; + out[2] &= is_p; + out[3] &= is_p; + out[4] &= is_p; + out[5] &= is_p; + out[6] &= is_p; + out[7] &= is_p; + out[8] &= is_p; + + /* In order to test that |out| >= 2^521-1 we need only test if out[8] + * >> 57 is greater than zero as (2^521-1) + x >= 2^522 */ + is_greater = out[8] >> 57; + is_greater |= is_greater << 32; + is_greater |= is_greater << 16; + is_greater |= is_greater << 8; + is_greater |= is_greater << 4; + is_greater |= is_greater << 2; + is_greater |= is_greater << 1; + is_greater = ((s64) is_greater) >> 63; + + out[0] -= kPrime[0] & is_greater; + out[1] -= kPrime[1] & is_greater; + out[2] -= kPrime[2] & is_greater; + out[3] -= kPrime[3] & is_greater; + out[4] -= kPrime[4] & is_greater; + out[5] -= kPrime[5] & is_greater; + out[6] -= kPrime[6] & is_greater; + out[7] -= kPrime[7] & is_greater; + out[8] -= kPrime[8] & is_greater; + + /* Eliminate negative coefficients */ + sign = -(out[0] >> 63); out[0] += (two58 & sign); out[1] -= (1 & sign); + sign = -(out[1] >> 63); out[1] += (two58 & sign); out[2] -= (1 & sign); + sign = -(out[2] >> 63); out[2] += (two58 & sign); out[3] -= (1 & sign); + sign = -(out[3] >> 63); out[3] += (two58 & sign); out[4] -= (1 & sign); + sign = -(out[4] >> 63); out[4] += (two58 & sign); out[5] -= (1 & sign); + sign = -(out[0] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign); + sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign); + sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign); + sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign); + sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign); + sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign); + } + +/* Group operations + * ---------------- + * + * Building on top of the field operations we have the operations on the + * elliptic curve group itself. Points on the curve are represented in Jacobian + * coordinates */ + +/* point_double calcuates 2*(x_in, y_in, z_in) + * + * The method is taken from: + * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b + * + * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed. + * while x_out == y_in is not (maybe this works, but it's not tested). */ +static void +point_double(felem x_out, felem y_out, felem z_out, + const felem x_in, const felem y_in, const felem z_in) + { + largefelem tmp, tmp2; + felem delta, gamma, beta, alpha, ftmp, ftmp2; + + felem_assign(ftmp, x_in); + felem_assign(ftmp2, x_in); + + /* delta = z^2 */ + felem_square(tmp, z_in); + felem_reduce(delta, tmp); /* delta[i] < 2^59 + 2^14 */ + + /* gamma = y^2 */ + felem_square(tmp, y_in); + felem_reduce(gamma, tmp); /* gamma[i] < 2^59 + 2^14 */ + + /* beta = x*gamma */ + felem_mul(tmp, x_in, gamma); + felem_reduce(beta, tmp); /* beta[i] < 2^59 + 2^14 */ + + /* alpha = 3*(x-delta)*(x+delta) */ + felem_diff64(ftmp, delta); + /* ftmp[i] < 2^61 */ + felem_sum64(ftmp2, delta); + /* ftmp2[i] < 2^60 + 2^15 */ + felem_scalar64(ftmp2, 3); + /* ftmp2[i] < 3*2^60 + 3*2^15 */ + felem_mul(tmp, ftmp, ftmp2); + /* tmp[i] < 17(3*2^121 + 3*2^76) + * = 61*2^121 + 61*2^76 + * < 64*2^121 + 64*2^76 + * = 2^127 + 2^82 + * < 2^128 */ + felem_reduce(alpha, tmp); + + /* x' = alpha^2 - 8*beta */ + felem_square(tmp, alpha); + /* tmp[i] < 17*2^120 + * < 2^125 */ + felem_assign(ftmp, beta); + felem_scalar64(ftmp, 8); + /* ftmp[i] < 2^62 + 2^17 */ + felem_diff_128_64(tmp, ftmp); + /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */ + felem_reduce(x_out, tmp); + + /* z' = (y + z)^2 - gamma - delta */ + felem_sum64(delta, gamma); + /* delta[i] < 2^60 + 2^15 */ + felem_assign(ftmp, y_in); + felem_sum64(ftmp, z_in); + /* ftmp[i] < 2^60 + 2^15 */ + felem_square(tmp, ftmp); + /* tmp[i] < 17(2^122) + * < 2^127 */ + felem_diff_128_64(tmp, delta); + /* tmp[i] < 2^127 + 2^63 */ + felem_reduce(z_out, tmp); + + /* y' = alpha*(4*beta - x') - 8*gamma^2 */ + felem_scalar64(beta, 4); + /* beta[i] < 2^61 + 2^16 */ + felem_diff64(beta, x_out); + /* beta[i] < 2^61 + 2^60 + 2^16 */ + felem_mul(tmp, alpha, beta); + /* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16)) + * = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30) + * = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30) + * < 2^128 */ + felem_square(tmp2, gamma); + /* tmp2[i] < 17*(2^59 + 2^14)^2 + * = 17*(2^118 + 2^74 + 2^28) */ + felem_scalar128(tmp2, 8); + /* tmp2[i] < 8*17*(2^118 + 2^74 + 2^28) + * = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31 + * < 2^126 */ + felem_diff128(tmp, tmp2); + /* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30) + * = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 + + * 2^74 + 2^69 + 2^34 + 2^30 + * < 2^128 */ + felem_reduce(y_out, tmp); + } + +/* copy_conditional copies in to out iff mask is all ones. */ +static void +copy_conditional(felem out, const felem in, limb mask) + { + unsigned i; + for (i = 0; i < NLIMBS; ++i) + { + const limb tmp = mask & (in[i] ^ out[i]); + out[i] ^= tmp; + } + } + +/* point_add calcuates (x1, y1, z1) + (x2, y2, z2) + * + * The method is taken from + * http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, + * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). + * + * This function includes a branch for checking whether the two input points + * are equal (while not equal to the point at infinity). This case never + * happens during single point multiplication, so there is no timing leak for + * ECDH or ECDSA signing. */ +static void point_add(felem x3, felem y3, felem z3, + const felem x1, const felem y1, const felem z1, + const int mixed, const felem x2, const felem y2, const felem z2) + { + felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out; + largefelem tmp, tmp2; + limb x_equal, y_equal, z1_is_zero, z2_is_zero; + + z1_is_zero = felem_is_zero(z1); + z2_is_zero = felem_is_zero(z2); + + /* ftmp = z1z1 = z1**2 */ + felem_square(tmp, z1); + felem_reduce(ftmp, tmp); + + if (!mixed) + { + /* ftmp2 = z2z2 = z2**2 */ + felem_square(tmp, z2); + felem_reduce(ftmp2, tmp); + + /* u1 = ftmp3 = x1*z2z2 */ + felem_mul(tmp, x1, ftmp2); + felem_reduce(ftmp3, tmp); + + /* ftmp5 = z1 + z2 */ + felem_assign(ftmp5, z1); + felem_sum64(ftmp5, z2); + /* ftmp5[i] < 2^61 */ + + /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */ + felem_square(tmp, ftmp5); + /* tmp[i] < 17*2^122 */ + felem_diff_128_64(tmp, ftmp); + /* tmp[i] < 17*2^122 + 2^63 */ + felem_diff_128_64(tmp, ftmp2); + /* tmp[i] < 17*2^122 + 2^64 */ + felem_reduce(ftmp5, tmp); + + /* ftmp2 = z2 * z2z2 */ + felem_mul(tmp, ftmp2, z2); + felem_reduce(ftmp2, tmp); + + /* s1 = ftmp6 = y1 * z2**3 */ + felem_mul(tmp, y1, ftmp2); + felem_reduce(ftmp6, tmp); + } + else + { + /* We'll assume z2 = 1 (special case z2 = 0 is handled later) */ + + /* u1 = ftmp3 = x1*z2z2 */ + felem_assign(ftmp3, x1); + + /* ftmp5 = 2*z1z2 */ + felem_scalar(ftmp5, z1, 2); + + /* s1 = ftmp6 = y1 * z2**3 */ + felem_assign(ftmp6, y1); + } + + /* u2 = x2*z1z1 */ + felem_mul(tmp, x2, ftmp); + /* tmp[i] < 17*2^120 */ + + /* h = ftmp4 = u2 - u1 */ + felem_diff_128_64(tmp, ftmp3); + /* tmp[i] < 17*2^120 + 2^63 */ + felem_reduce(ftmp4, tmp); + + x_equal = felem_is_zero(ftmp4); + + /* z_out = ftmp5 * h */ + felem_mul(tmp, ftmp5, ftmp4); + felem_reduce(z_out, tmp); + + /* ftmp = z1 * z1z1 */ + felem_mul(tmp, ftmp, z1); + felem_reduce(ftmp, tmp); + + /* s2 = tmp = y2 * z1**3 */ + felem_mul(tmp, y2, ftmp); + /* tmp[i] < 17*2^120 */ + + /* r = ftmp5 = (s2 - s1)*2 */ + felem_diff_128_64(tmp, ftmp6); + /* tmp[i] < 17*2^120 + 2^63 */ + felem_reduce(ftmp5, tmp); + y_equal = felem_is_zero(ftmp5); + felem_scalar64(ftmp5, 2); + /* ftmp5[i] < 2^61 */ + + if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) + { + point_double(x3, y3, z3, x1, y1, z1); + return; + } + + /* I = ftmp = (2h)**2 */ + felem_assign(ftmp, ftmp4); + felem_scalar64(ftmp, 2); + /* ftmp[i] < 2^61 */ + felem_square(tmp, ftmp); + /* tmp[i] < 17*2^122 */ + felem_reduce(ftmp, tmp); + + /* J = ftmp2 = h * I */ + felem_mul(tmp, ftmp4, ftmp); + felem_reduce(ftmp2, tmp); + + /* V = ftmp4 = U1 * I */ + felem_mul(tmp, ftmp3, ftmp); + felem_reduce(ftmp4, tmp); + + /* x_out = r**2 - J - 2V */ + felem_square(tmp, ftmp5); + /* tmp[i] < 17*2^122 */ + felem_diff_128_64(tmp, ftmp2); + /* tmp[i] < 17*2^122 + 2^63 */ + felem_assign(ftmp3, ftmp4); + felem_scalar64(ftmp4, 2); + /* ftmp4[i] < 2^61 */ + felem_diff_128_64(tmp, ftmp4); + /* tmp[i] < 17*2^122 + 2^64 */ + felem_reduce(x_out, tmp); + + /* y_out = r(V-x_out) - 2 * s1 * J */ + felem_diff64(ftmp3, x_out); + /* ftmp3[i] < 2^60 + 2^60 + * = 2^61 */ + felem_mul(tmp, ftmp5, ftmp3); + /* tmp[i] < 17*2^122 */ + felem_mul(tmp2, ftmp6, ftmp2); + /* tmp2[i] < 17*2^120 */ + felem_scalar128(tmp2, 2); + /* tmp2[i] < 17*2^121 */ + felem_diff128(tmp, tmp2); + /* tmp[i] < 2^127 - 2^69 + 17*2^122 + * = 2^126 - 2^122 - 2^6 - 2^2 - 1 + * < 2^127 */ + felem_reduce(y_out, tmp); + + copy_conditional(x_out, x2, z1_is_zero); + copy_conditional(x_out, x1, z2_is_zero); + copy_conditional(y_out, y2, z1_is_zero); + copy_conditional(y_out, y1, z2_is_zero); + copy_conditional(z_out, z2, z1_is_zero); + copy_conditional(z_out, z1, z2_is_zero); + felem_assign(x3, x_out); + felem_assign(y3, y_out); + felem_assign(z3, z_out); + } + +/* Base point pre computation + * -------------------------- + * + * Two different sorts of precomputed tables are used in the following code. + * Each contain various points on the curve, where each point is three field + * elements (x, y, z). + * + * For the base point table, z is usually 1 (0 for the point at infinity). + * This table has 16 elements: + * index | bits | point + * ------+---------+------------------------------ + * 0 | 0 0 0 0 | 0G + * 1 | 0 0 0 1 | 1G + * 2 | 0 0 1 0 | 2^130G + * 3 | 0 0 1 1 | (2^130 + 1)G + * 4 | 0 1 0 0 | 2^260G + * 5 | 0 1 0 1 | (2^260 + 1)G + * 6 | 0 1 1 0 | (2^260 + 2^130)G + * 7 | 0 1 1 1 | (2^260 + 2^130 + 1)G + * 8 | 1 0 0 0 | 2^390G + * 9 | 1 0 0 1 | (2^390 + 1)G + * 10 | 1 0 1 0 | (2^390 + 2^130)G + * 11 | 1 0 1 1 | (2^390 + 2^130 + 1)G + * 12 | 1 1 0 0 | (2^390 + 2^260)G + * 13 | 1 1 0 1 | (2^390 + 2^260 + 1)G + * 14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G + * 15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G + * + * The reason for this is so that we can clock bits into four different + * locations when doing simple scalar multiplies against the base point. + * + * Tables for other points have table[i] = iG for i in 0 .. 16. */ + +/* gmul is the table of precomputed base points */ +static const felem gmul[16][3] = + {{{0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334, + 0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8, + 0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404}, + {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353, + 0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45, + 0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad, + 0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e, + 0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5}, + {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58, + 0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c, + 0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873, + 0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c, + 0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9}, + {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52, + 0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e, + 0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2, + 0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561, + 0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065}, + {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a, + 0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e, + 0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6, + 0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51, + 0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe}, + {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d, + 0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c, + 0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27, + 0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f, + 0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256}, + {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa, + 0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2, + 0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890, + 0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74, + 0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23}, + {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516, + 0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1, + 0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce, + 0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7, + 0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5}, + {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318, + 0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83, + 0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae, + 0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef, + 0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203}, + {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447, + 0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283, + 0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5, + 0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c, + 0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a}, + {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df, + 0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645, + 0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292, + 0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422, + 0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b}, + {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30, + 0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb, + 0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767, + 0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3, + 0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf}, + {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2, + 0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692, + 0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3, + 0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade, + 0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684}, + {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8, + 0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a, + 0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608, + 0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610, + 0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d}, + {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006, + 0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86, + 0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}, + {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c, + 0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9, + 0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f}, + {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7, + 0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c, + 0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055}, + {1, 0, 0, 0, 0, 0, 0, 0, 0}}}; + +/* select_point selects the |idx|th point from a precomputation table and + * copies it to out. */ +static void select_point(const limb idx, unsigned int size, const felem pre_comp[/* size */][3], + felem out[3]) + { + unsigned i, j; + limb *outlimbs = &out[0][0]; + memset(outlimbs, 0, 3 * sizeof(felem)); + + for (i = 0; i < size; i++) + { + const limb *inlimbs = &pre_comp[i][0][0]; + limb mask = i ^ idx; + mask |= mask >> 4; + mask |= mask >> 2; + mask |= mask >> 1; + mask &= 1; + mask--; + for (j = 0; j < NLIMBS * 3; j++) + outlimbs[j] |= inlimbs[j] & mask; + } + } + +/* get_bit returns the |i|th bit in |in| */ +static char get_bit(const felem_bytearray in, int i) + { + if (i < 0) + return 0; + return (in[i >> 3] >> (i & 7)) & 1; + } + +/* Interleaved point multiplication using precomputed point multiples: + * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], + * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple + * of the generator, using certain (large) precomputed multiples in g_pre_comp. + * Output point (X, Y, Z) is stored in x_out, y_out, z_out */ +static void batch_mul(felem x_out, felem y_out, felem z_out, + const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar, + const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3]) + { + int i, skip; + unsigned num, gen_mul = (g_scalar != NULL); + felem nq[3], tmp[4]; + limb bits; + u8 sign, digit; + + /* set nq to the point at infinity */ + memset(nq, 0, 3 * sizeof(felem)); + + /* Loop over all scalars msb-to-lsb, interleaving additions + * of multiples of the generator (last quarter of rounds) + * and additions of other points multiples (every 5th round). + */ + skip = 1; /* save two point operations in the first round */ + for (i = (num_points ? 520 : 130); i >= 0; --i) + { + /* double */ + if (!skip) + point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); + + /* add multiples of the generator */ + if (gen_mul && (i <= 130)) + { + bits = get_bit(g_scalar, i + 390) << 3; + if (i < 130) + { + bits |= get_bit(g_scalar, i + 260) << 2; + bits |= get_bit(g_scalar, i + 130) << 1; + bits |= get_bit(g_scalar, i); + } + /* select the point to add, in constant time */ + select_point(bits, 16, g_pre_comp, tmp); + if (!skip) + { + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + 1 /* mixed */, tmp[0], tmp[1], tmp[2]); + } + else + { + memcpy(nq, tmp, 3 * sizeof(felem)); + skip = 0; + } + } + + /* do other additions every 5 doublings */ + if (num_points && (i % 5 == 0)) + { + /* loop over all scalars */ + for (num = 0; num < num_points; ++num) + { + bits = get_bit(scalars[num], i + 4) << 5; + bits |= get_bit(scalars[num], i + 3) << 4; + bits |= get_bit(scalars[num], i + 2) << 3; + bits |= get_bit(scalars[num], i + 1) << 2; + bits |= get_bit(scalars[num], i) << 1; + bits |= get_bit(scalars[num], i - 1); + ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits); + + /* select the point to add or subtract, in constant time */ + select_point(digit, 17, pre_comp[num], tmp); + felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */ + copy_conditional(tmp[1], tmp[3], (-(limb) sign)); + + if (!skip) + { + point_add(nq[0], nq[1], nq[2], + nq[0], nq[1], nq[2], + mixed, tmp[0], tmp[1], tmp[2]); + } + else + { + memcpy(nq, tmp, 3 * sizeof(felem)); + skip = 0; + } + } + } + } + felem_assign(x_out, nq[0]); + felem_assign(y_out, nq[1]); + felem_assign(z_out, nq[2]); + } + + +/* Precomputation for the group generator. */ +typedef struct { + felem g_pre_comp[16][3]; + int references; +} NISTP521_PRE_COMP; + +const EC_METHOD *EC_GFp_nistp521_method(void) + { + static const EC_METHOD ret = { + EC_FLAGS_DEFAULT_OCT, + NID_X9_62_prime_field, + ec_GFp_nistp521_group_init, + ec_GFp_simple_group_finish, + ec_GFp_simple_group_clear_finish, + ec_GFp_nist_group_copy, + ec_GFp_nistp521_group_set_curve, + ec_GFp_simple_group_get_curve, + ec_GFp_simple_group_get_degree, + ec_GFp_simple_group_check_discriminant, + ec_GFp_simple_point_init, + ec_GFp_simple_point_finish, + ec_GFp_simple_point_clear_finish, + ec_GFp_simple_point_copy, + ec_GFp_simple_point_set_to_infinity, + ec_GFp_simple_set_Jprojective_coordinates_GFp, + ec_GFp_simple_get_Jprojective_coordinates_GFp, + ec_GFp_simple_point_set_affine_coordinates, + ec_GFp_nistp521_point_get_affine_coordinates, + 0 /* point_set_compressed_coordinates */, + 0 /* point2oct */, + 0 /* oct2point */, + ec_GFp_simple_add, + ec_GFp_simple_dbl, + ec_GFp_simple_invert, + ec_GFp_simple_is_at_infinity, + ec_GFp_simple_is_on_curve, + ec_GFp_simple_cmp, + ec_GFp_simple_make_affine, + ec_GFp_simple_points_make_affine, + ec_GFp_nistp521_points_mul, + ec_GFp_nistp521_precompute_mult, + ec_GFp_nistp521_have_precompute_mult, + ec_GFp_nist_field_mul, + ec_GFp_nist_field_sqr, + 0 /* field_div */, + 0 /* field_encode */, + 0 /* field_decode */, + 0 /* field_set_to_one */ }; + + return &ret; + } + + +/******************************************************************************/ +/* FUNCTIONS TO MANAGE PRECOMPUTATION + */ + +static NISTP521_PRE_COMP *nistp521_pre_comp_new() + { + NISTP521_PRE_COMP *ret = NULL; + ret = (NISTP521_PRE_COMP *)OPENSSL_malloc(sizeof(NISTP521_PRE_COMP)); + if (!ret) + { + ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE); + return ret; + } + memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp)); + ret->references = 1; + return ret; + } + +static void *nistp521_pre_comp_dup(void *src_) + { + NISTP521_PRE_COMP *src = src_; + + /* no need to actually copy, these objects never change! */ + CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP); + + return src_; + } + +static void nistp521_pre_comp_free(void *pre_) + { + int i; + NISTP521_PRE_COMP *pre = pre_; + + if (!pre) + return; + + i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); + if (i > 0) + return; + + OPENSSL_free(pre); + } + +static void nistp521_pre_comp_clear_free(void *pre_) + { + int i; + NISTP521_PRE_COMP *pre = pre_; + + if (!pre) + return; + + i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP); + if (i > 0) + return; + + OPENSSL_cleanse(pre, sizeof(*pre)); + OPENSSL_free(pre); + } + +/******************************************************************************/ +/* OPENSSL EC_METHOD FUNCTIONS + */ + +int ec_GFp_nistp521_group_init(EC_GROUP *group) + { + int ret; + ret = ec_GFp_simple_group_init(group); + group->a_is_minus3 = 1; + return ret; + } + +int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p, + const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) + { + int ret = 0; + BN_CTX *new_ctx = NULL; + BIGNUM *curve_p, *curve_a, *curve_b; + + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((curve_p = BN_CTX_get(ctx)) == NULL) || + ((curve_a = BN_CTX_get(ctx)) == NULL) || + ((curve_b = BN_CTX_get(ctx)) == NULL)) goto err; + BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p); + BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a); + BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b); + if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || + (BN_cmp(curve_b, b))) + { + ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE, + EC_R_WRONG_CURVE_PARAMETERS); + goto err; + } + group->field_mod_func = BN_nist_mod_521; + ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx); +err: + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + } + +/* Takes the Jacobian coordinates (X, Y, Z) of a point and returns + * (X', Y') = (X/Z^2, Y/Z^3) */ +int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group, + const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx) + { + felem z1, z2, x_in, y_in, x_out, y_out; + largefelem tmp; + + if (EC_POINT_is_at_infinity(group, point)) + { + ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, + EC_R_POINT_AT_INFINITY); + return 0; + } + if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) || + (!BN_to_felem(z1, &point->Z))) return 0; + felem_inv(z2, z1); + felem_square(tmp, z2); felem_reduce(z1, tmp); + felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp); + felem_contract(x_out, x_in); + if (x != NULL) + { + if (!felem_to_BN(x, x_out)) + { + ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB); + return 0; + } + } + felem_mul(tmp, z1, z2); felem_reduce(z1, tmp); + felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp); + felem_contract(y_out, y_in); + if (y != NULL) + { + if (!felem_to_BN(y, y_out)) + { + ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB); + return 0; + } + } + return 1; + } + +static void make_points_affine(size_t num, felem points[/* num */][3], felem tmp_felems[/* num+1 */]) + { + /* Runs in constant time, unless an input is the point at infinity + * (which normally shouldn't happen). */ + ec_GFp_nistp_points_make_affine_internal( + num, + points, + sizeof(felem), + tmp_felems, + (void (*)(void *)) felem_one, + (int (*)(const void *)) felem_is_zero_int, + (void (*)(void *, const void *)) felem_assign, + (void (*)(void *, const void *)) felem_square_reduce, + (void (*)(void *, const void *, const void *)) felem_mul_reduce, + (void (*)(void *, const void *)) felem_inv, + (void (*)(void *, const void *)) felem_contract); + } + +/* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values + * Result is stored in r (r can equal one of the inputs). */ +int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r, + const BIGNUM *scalar, size_t num, const EC_POINT *points[], + const BIGNUM *scalars[], BN_CTX *ctx) + { + int ret = 0; + int j; + int mixed = 0; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y, *z, *tmp_scalar; + felem_bytearray g_secret; + felem_bytearray *secrets = NULL; + felem (*pre_comp)[17][3] = NULL; + felem *tmp_felems = NULL; + felem_bytearray tmp; + unsigned i, num_bytes; + int have_pre_comp = 0; + size_t num_points = num; + felem x_in, y_in, z_in, x_out, y_out, z_out; + NISTP521_PRE_COMP *pre = NULL; + felem (*g_pre_comp)[3] = NULL; + EC_POINT *generator = NULL; + const EC_POINT *p = NULL; + const BIGNUM *p_scalar = NULL; + + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((x = BN_CTX_get(ctx)) == NULL) || + ((y = BN_CTX_get(ctx)) == NULL) || + ((z = BN_CTX_get(ctx)) == NULL) || + ((tmp_scalar = BN_CTX_get(ctx)) == NULL)) + goto err; + + if (scalar != NULL) + { + pre = EC_EX_DATA_get_data(group->extra_data, + nistp521_pre_comp_dup, nistp521_pre_comp_free, + nistp521_pre_comp_clear_free); + if (pre) + /* we have precomputation, try to use it */ + g_pre_comp = &pre->g_pre_comp[0]; + else + /* try to use the standard precomputation */ + g_pre_comp = (felem (*)[3]) gmul; + generator = EC_POINT_new(group); + if (generator == NULL) + goto err; + /* get the generator from precomputation */ + if (!felem_to_BN(x, g_pre_comp[1][0]) || + !felem_to_BN(y, g_pre_comp[1][1]) || + !felem_to_BN(z, g_pre_comp[1][2])) + { + ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + if (!EC_POINT_set_Jprojective_coordinates_GFp(group, + generator, x, y, z, ctx)) + goto err; + if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) + /* precomputation matches generator */ + have_pre_comp = 1; + else + /* we don't have valid precomputation: + * treat the generator as a random point */ + num_points++; + } + + if (num_points > 0) + { + if (num_points >= 2) + { + /* unless we precompute multiples for just one point, + * converting those into affine form is time well spent */ + mixed = 1; + } + secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray)); + pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem)); + if (mixed) + tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem)); + if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL))) + { + ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE); + goto err; + } + + /* we treat NULL scalars as 0, and NULL points as points at infinity, + * i.e., they contribute nothing to the linear combination */ + memset(secrets, 0, num_points * sizeof(felem_bytearray)); + memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem)); + for (i = 0; i < num_points; ++i) + { + if (i == num) + /* we didn't have a valid precomputation, so we pick + * the generator */ + { + p = EC_GROUP_get0_generator(group); + p_scalar = scalar; + } + else + /* the i^th point */ + { + p = points[i]; + p_scalar = scalars[i]; + } + if ((p_scalar != NULL) && (p != NULL)) + { + /* reduce scalar to 0 <= scalar < 2^521 */ + if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar))) + { + /* this is an unusual input, and we don't guarantee + * constant-timeness */ + if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) + { + ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + num_bytes = BN_bn2bin(tmp_scalar, tmp); + } + else + num_bytes = BN_bn2bin(p_scalar, tmp); + flip_endian(secrets[i], tmp, num_bytes); + /* precompute multiples */ + if ((!BN_to_felem(x_out, &p->X)) || + (!BN_to_felem(y_out, &p->Y)) || + (!BN_to_felem(z_out, &p->Z))) goto err; + memcpy(pre_comp[i][1][0], x_out, sizeof(felem)); + memcpy(pre_comp[i][1][1], y_out, sizeof(felem)); + memcpy(pre_comp[i][1][2], z_out, sizeof(felem)); + for (j = 2; j <= 16; ++j) + { + if (j & 1) + { + point_add( + pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], + pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2], + 0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]); + } + else + { + point_double( + pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2], + pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]); + } + } + } + } + if (mixed) + make_points_affine(num_points * 17, pre_comp[0], tmp_felems); + } + + /* the scalar for the generator */ + if ((scalar != NULL) && (have_pre_comp)) + { + memset(g_secret, 0, sizeof(g_secret)); + /* reduce scalar to 0 <= scalar < 2^521 */ + if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) + { + /* this is an unusual input, and we don't guarantee + * constant-timeness */ + if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) + { + ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + num_bytes = BN_bn2bin(tmp_scalar, tmp); + } + else + num_bytes = BN_bn2bin(scalar, tmp); + flip_endian(g_secret, tmp, num_bytes); + /* do the multiplication with generator precomputation*/ + batch_mul(x_out, y_out, z_out, + (const felem_bytearray (*)) secrets, num_points, + g_secret, + mixed, (const felem (*)[17][3]) pre_comp, + (const felem (*)[3]) g_pre_comp); + } + else + /* do the multiplication without generator precomputation */ + batch_mul(x_out, y_out, z_out, + (const felem_bytearray (*)) secrets, num_points, + NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL); + /* reduce the output to its unique minimal representation */ + felem_contract(x_in, x_out); + felem_contract(y_in, y_out); + felem_contract(z_in, z_out); + if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) || + (!felem_to_BN(z, z_in))) + { + ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB); + goto err; + } + ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx); + +err: + BN_CTX_end(ctx); + if (generator != NULL) + EC_POINT_free(generator); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + if (secrets != NULL) + OPENSSL_free(secrets); + if (pre_comp != NULL) + OPENSSL_free(pre_comp); + if (tmp_felems != NULL) + OPENSSL_free(tmp_felems); + return ret; + } + +int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx) + { + int ret = 0; + NISTP521_PRE_COMP *pre = NULL; + int i, j; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y; + EC_POINT *generator = NULL; + felem tmp_felems[16]; + + /* throw away old precomputation */ + EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup, + nistp521_pre_comp_free, nistp521_pre_comp_clear_free); + if (ctx == NULL) + if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0; + BN_CTX_start(ctx); + if (((x = BN_CTX_get(ctx)) == NULL) || + ((y = BN_CTX_get(ctx)) == NULL)) + goto err; + /* get the generator */ + if (group->generator == NULL) goto err; + generator = EC_POINT_new(group); + if (generator == NULL) + goto err; + BN_bin2bn(nistp521_curve_params[3], sizeof (felem_bytearray), x); + BN_bin2bn(nistp521_curve_params[4], sizeof (felem_bytearray), y); + if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx)) + goto err; + if ((pre = nistp521_pre_comp_new()) == NULL) + goto err; + /* if the generator is the standard one, use built-in precomputation */ + if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) + { + memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp)); + ret = 1; + goto err; + } + if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) || + (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) || + (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z))) + goto err; + /* compute 2^130*G, 2^260*G, 2^390*G */ + for (i = 1; i <= 4; i <<= 1) + { + point_double(pre->g_pre_comp[2*i][0], pre->g_pre_comp[2*i][1], + pre->g_pre_comp[2*i][2], pre->g_pre_comp[i][0], + pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]); + for (j = 0; j < 129; ++j) + { + point_double(pre->g_pre_comp[2*i][0], + pre->g_pre_comp[2*i][1], + pre->g_pre_comp[2*i][2], + pre->g_pre_comp[2*i][0], + pre->g_pre_comp[2*i][1], + pre->g_pre_comp[2*i][2]); + } + } + /* g_pre_comp[0] is the point at infinity */ + memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0])); + /* the remaining multiples */ + /* 2^130*G + 2^260*G */ + point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1], + pre->g_pre_comp[6][2], pre->g_pre_comp[4][0], + pre->g_pre_comp[4][1], pre->g_pre_comp[4][2], + 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], + pre->g_pre_comp[2][2]); + /* 2^130*G + 2^390*G */ + point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1], + pre->g_pre_comp[10][2], pre->g_pre_comp[8][0], + pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], + 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], + pre->g_pre_comp[2][2]); + /* 2^260*G + 2^390*G */ + point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], + pre->g_pre_comp[12][2], pre->g_pre_comp[8][0], + pre->g_pre_comp[8][1], pre->g_pre_comp[8][2], + 0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1], + pre->g_pre_comp[4][2]); + /* 2^130*G + 2^260*G + 2^390*G */ + point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1], + pre->g_pre_comp[14][2], pre->g_pre_comp[12][0], + pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], + 0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1], + pre->g_pre_comp[2][2]); + for (i = 1; i < 8; ++i) + { + /* odd multiples: add G */ + point_add(pre->g_pre_comp[2*i+1][0], pre->g_pre_comp[2*i+1][1], + pre->g_pre_comp[2*i+1][2], pre->g_pre_comp[2*i][0], + pre->g_pre_comp[2*i][1], pre->g_pre_comp[2*i][2], + 0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1], + pre->g_pre_comp[1][2]); + } + make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems); + + if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup, + nistp521_pre_comp_free, nistp521_pre_comp_clear_free)) + goto err; + ret = 1; + pre = NULL; + err: + BN_CTX_end(ctx); + if (generator != NULL) + EC_POINT_free(generator); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + if (pre) + nistp521_pre_comp_free(pre); + return ret; + } + +int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group) + { + if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup, + nistp521_pre_comp_free, nistp521_pre_comp_clear_free) + != NULL) + return 1; + else + return 0; + } + +#else +static void *dummy=&dummy; +#endif diff --git a/src/lib/libcrypto/ec/ecp_nistputil.c b/src/lib/libcrypto/ec/ecp_nistputil.c new file mode 100644 index 0000000000..c8140c807f --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_nistputil.c @@ -0,0 +1,197 @@ +/* crypto/ec/ecp_nistputil.c */ +/* + * Written by Bodo Moeller for the OpenSSL project. + */ +/* Copyright 2011 Google Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#ifndef OPENSSL_NO_EC_NISTP_64_GCC_128 + +/* + * Common utility functions for ecp_nistp224.c, ecp_nistp256.c, ecp_nistp521.c. + */ + +#include +#include "ec_lcl.h" + +/* Convert an array of points into affine coordinates. + * (If the point at infinity is found (Z = 0), it remains unchanged.) + * This function is essentially an equivalent to EC_POINTs_make_affine(), but + * works with the internal representation of points as used by ecp_nistp###.c + * rather than with (BIGNUM-based) EC_POINT data structures. + * + * point_array is the input/output buffer ('num' points in projective form, + * i.e. three coordinates each), based on an internal representation of + * field elements of size 'felem_size'. + * + * tmp_felems needs to point to a temporary array of 'num'+1 field elements + * for storage of intermediate values. + */ +void ec_GFp_nistp_points_make_affine_internal(size_t num, void *point_array, + size_t felem_size, void *tmp_felems, + void (*felem_one)(void *out), + int (*felem_is_zero)(const void *in), + void (*felem_assign)(void *out, const void *in), + void (*felem_square)(void *out, const void *in), + void (*felem_mul)(void *out, const void *in1, const void *in2), + void (*felem_inv)(void *out, const void *in), + void (*felem_contract)(void *out, const void *in)) + { + int i = 0; + +#define tmp_felem(I) (&((char *)tmp_felems)[(I) * felem_size]) +#define X(I) (&((char *)point_array)[3*(I) * felem_size]) +#define Y(I) (&((char *)point_array)[(3*(I) + 1) * felem_size]) +#define Z(I) (&((char *)point_array)[(3*(I) + 2) * felem_size]) + + if (!felem_is_zero(Z(0))) + felem_assign(tmp_felem(0), Z(0)); + else + felem_one(tmp_felem(0)); + for (i = 1; i < (int)num; i++) + { + if (!felem_is_zero(Z(i))) + felem_mul(tmp_felem(i), tmp_felem(i-1), Z(i)); + else + felem_assign(tmp_felem(i), tmp_felem(i-1)); + } + /* Now each tmp_felem(i) is the product of Z(0) .. Z(i), skipping any zero-valued factors: + * if Z(i) = 0, we essentially pretend that Z(i) = 1 */ + + felem_inv(tmp_felem(num-1), tmp_felem(num-1)); + for (i = num - 1; i >= 0; i--) + { + if (i > 0) + /* tmp_felem(i-1) is the product of Z(0) .. Z(i-1), + * tmp_felem(i) is the inverse of the product of Z(0) .. Z(i) + */ + felem_mul(tmp_felem(num), tmp_felem(i-1), tmp_felem(i)); /* 1/Z(i) */ + else + felem_assign(tmp_felem(num), tmp_felem(0)); /* 1/Z(0) */ + + if (!felem_is_zero(Z(i))) + { + if (i > 0) + /* For next iteration, replace tmp_felem(i-1) by its inverse */ + felem_mul(tmp_felem(i-1), tmp_felem(i), Z(i)); + + /* Convert point (X, Y, Z) into affine form (X/(Z^2), Y/(Z^3), 1) */ + felem_square(Z(i), tmp_felem(num)); /* 1/(Z^2) */ + felem_mul(X(i), X(i), Z(i)); /* X/(Z^2) */ + felem_mul(Z(i), Z(i), tmp_felem(num)); /* 1/(Z^3) */ + felem_mul(Y(i), Y(i), Z(i)); /* Y/(Z^3) */ + felem_contract(X(i), X(i)); + felem_contract(Y(i), Y(i)); + felem_one(Z(i)); + } + else + { + if (i > 0) + /* For next iteration, replace tmp_felem(i-1) by its inverse */ + felem_assign(tmp_felem(i-1), tmp_felem(i)); + } + } + } + +/* + * This function looks at 5+1 scalar bits (5 current, 1 adjacent less + * significant bit), and recodes them into a signed digit for use in fast point + * multiplication: the use of signed rather than unsigned digits means that + * fewer points need to be precomputed, given that point inversion is easy + * (a precomputed point dP makes -dP available as well). + * + * BACKGROUND: + * + * Signed digits for multiplication were introduced by Booth ("A signed binary + * multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV, + * pt. 2 (1951), pp. 236-240), in that case for multiplication of integers. + * Booth's original encoding did not generally improve the density of nonzero + * digits over the binary representation, and was merely meant to simplify the + * handling of signed factors given in two's complement; but it has since been + * shown to be the basis of various signed-digit representations that do have + * further advantages, including the wNAF, using the following general approach: + * + * (1) Given a binary representation + * + * b_k ... b_2 b_1 b_0, + * + * of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1 + * by using bit-wise subtraction as follows: + * + * b_k b_(k-1) ... b_2 b_1 b_0 + * - b_k ... b_3 b_2 b_1 b_0 + * ------------------------------------- + * s_k b_(k-1) ... s_3 s_2 s_1 s_0 + * + * A left-shift followed by subtraction of the original value yields a new + * representation of the same value, using signed bits s_i = b_(i+1) - b_i. + * This representation from Booth's paper has since appeared in the + * literature under a variety of different names including "reversed binary + * form", "alternating greedy expansion", "mutual opposite form", and + * "sign-alternating {+-1}-representation". + * + * An interesting property is that among the nonzero bits, values 1 and -1 + * strictly alternate. + * + * (2) Various window schemes can be applied to the Booth representation of + * integers: for example, right-to-left sliding windows yield the wNAF + * (a signed-digit encoding independently discovered by various researchers + * in the 1990s), and left-to-right sliding windows yield a left-to-right + * equivalent of the wNAF (independently discovered by various researchers + * around 2004). + * + * To prevent leaking information through side channels in point multiplication, + * we need to recode the given integer into a regular pattern: sliding windows + * as in wNAFs won't do, we need their fixed-window equivalent -- which is a few + * decades older: we'll be using the so-called "modified Booth encoding" due to + * MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49 + * (1961), pp. 67-91), in a radix-2^5 setting. That is, we always combine five + * signed bits into a signed digit: + * + * s_(4j + 4) s_(4j + 3) s_(4j + 2) s_(4j + 1) s_(4j) + * + * The sign-alternating property implies that the resulting digit values are + * integers from -16 to 16. + * + * Of course, we don't actually need to compute the signed digits s_i as an + * intermediate step (that's just a nice way to see how this scheme relates + * to the wNAF): a direct computation obtains the recoded digit from the + * six bits b_(4j + 4) ... b_(4j - 1). + * + * This function takes those five bits as an integer (0 .. 63), writing the + * recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute + * value, in the range 0 .. 8). Note that this integer essentially provides the + * input bits "shifted to the left" by one position: for example, the input to + * compute the least significant recoded digit, given that there's no bit b_-1, + * has to be b_4 b_3 b_2 b_1 b_0 0. + * + */ +void ec_GFp_nistp_recode_scalar_bits(unsigned char *sign, unsigned char *digit, unsigned char in) + { + unsigned char s, d; + + s = ~((in >> 5) - 1); /* sets all bits to MSB(in), 'in' seen as 6-bit value */ + d = (1 << 6) - in - 1; + d = (d & s) | (in & ~s); + d = (d >> 1) + (d & 1); + + *sign = s & 1; + *digit = d; + } +#else +static void *dummy=&dummy; +#endif diff --git a/src/lib/libcrypto/ec/ecp_oct.c b/src/lib/libcrypto/ec/ecp_oct.c new file mode 100644 index 0000000000..374a0ee731 --- /dev/null +++ b/src/lib/libcrypto/ec/ecp_oct.c @@ -0,0 +1,433 @@ +/* crypto/ec/ecp_oct.c */ +/* Includes code written by Lenka Fibikova + * for the OpenSSL project. + * Includes code written by Bodo Moeller for the OpenSSL project. +*/ +/* ==================================================================== + * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +/* ==================================================================== + * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. + * Portions of this software developed by SUN MICROSYSTEMS, INC., + * and contributed to the OpenSSL project. + */ + +#include +#include + +#include "ec_lcl.h" + +int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point, + const BIGNUM *x_, int y_bit, BN_CTX *ctx) + { + BN_CTX *new_ctx = NULL; + BIGNUM *tmp1, *tmp2, *x, *y; + int ret = 0; + + /* clear error queue*/ + ERR_clear_error(); + + if (ctx == NULL) + { + ctx = new_ctx = BN_CTX_new(); + if (ctx == NULL) + return 0; + } + + y_bit = (y_bit != 0); + + BN_CTX_start(ctx); + tmp1 = BN_CTX_get(ctx); + tmp2 = BN_CTX_get(ctx); + x = BN_CTX_get(ctx); + y = BN_CTX_get(ctx); + if (y == NULL) goto err; + + /* Recover y. We have a Weierstrass equation + * y^2 = x^3 + a*x + b, + * so y is one of the square roots of x^3 + a*x + b. + */ + + /* tmp1 := x^3 */ + if (!BN_nnmod(x, x_, &group->field,ctx)) goto err; + if (group->meth->field_decode == 0) + { + /* field_{sqr,mul} work on standard representation */ + if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err; + if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err; + } + else + { + if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err; + if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err; + } + + /* tmp1 := tmp1 + a*x */ + if (group->a_is_minus3) + { + if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err; + if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err; + if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err; + } + else + { + if (group->meth->field_decode) + { + if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err; + if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err; + } + else + { + /* field_mul works on standard representation */ + if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err; + } + + if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err; + } + + /* tmp1 := tmp1 + b */ + if (group->meth->field_decode) + { + if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err; + if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err; + } + else + { + if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err; + } + + if (!BN_mod_sqrt(y, tmp1, &group->field, ctx)) + { + unsigned long err = ERR_peek_last_error(); + + if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE) + { + ERR_clear_error(); + ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT); + } + else + ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB); + goto err; + } + + if (y_bit != BN_is_odd(y)) + { + if (BN_is_zero(y)) + { + int kron; + + kron = BN_kronecker(x, &group->field, ctx); + if (kron == -2) goto err; + + if (kron == 1) + ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT); + else + /* BN_mod_sqrt() should have cought this error (not a square) */ + ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT); + goto err; + } + if (!BN_usub(y, &group->field, y)) goto err; + } + if (y_bit != BN_is_odd(y)) + { + ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR); + goto err; + } + + if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err; + + ret = 1; + + err: + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + } + + +size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, + unsigned char *buf, size_t len, BN_CTX *ctx) + { + size_t ret; + BN_CTX *new_ctx = NULL; + int used_ctx = 0; + BIGNUM *x, *y; + size_t field_len, i, skip; + + if ((form != POINT_CONVERSION_COMPRESSED) + && (form != POINT_CONVERSION_UNCOMPRESSED) + && (form != POINT_CONVERSION_HYBRID)) + { + ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM); + goto err; + } + + if (EC_POINT_is_at_infinity(group, point)) + { + /* encodes to a single 0 octet */ + if (buf != NULL) + { + if (len < 1) + { + ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); + return 0; + } + buf[0] = 0; + } + return 1; + } + + + /* ret := required output buffer length */ + field_len = BN_num_bytes(&group->field); + ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; + + /* if 'buf' is NULL, just return required length */ + if (buf != NULL) + { + if (len < ret) + { + ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); + goto err; + } + + if (ctx == NULL) + { + ctx = new_ctx = BN_CTX_new(); + if (ctx == NULL) + return 0; + } + + BN_CTX_start(ctx); + used_ctx = 1; + x = BN_CTX_get(ctx); + y = BN_CTX_get(ctx); + if (y == NULL) goto err; + + if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err; + + if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y)) + buf[0] = form + 1; + else + buf[0] = form; + + i = 1; + + skip = field_len - BN_num_bytes(x); + if (skip > field_len) + { + ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + while (skip > 0) + { + buf[i++] = 0; + skip--; + } + skip = BN_bn2bin(x, buf + i); + i += skip; + if (i != 1 + field_len) + { + ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + + if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID) + { + skip = field_len - BN_num_bytes(y); + if (skip > field_len) + { + ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + while (skip > 0) + { + buf[i++] = 0; + skip--; + } + skip = BN_bn2bin(y, buf + i); + i += skip; + } + + if (i != ret) + { + ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); + goto err; + } + } + + if (used_ctx) + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + + err: + if (used_ctx) + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return 0; + } + + +int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point, + const unsigned char *buf, size_t len, BN_CTX *ctx) + { + point_conversion_form_t form; + int y_bit; + BN_CTX *new_ctx = NULL; + BIGNUM *x, *y; + size_t field_len, enc_len; + int ret = 0; + + if (len == 0) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL); + return 0; + } + form = buf[0]; + y_bit = form & 1; + form = form & ~1U; + if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED) + && (form != POINT_CONVERSION_UNCOMPRESSED) + && (form != POINT_CONVERSION_HYBRID)) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + + if (form == 0) + { + if (len != 1) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + + return EC_POINT_set_to_infinity(group, point); + } + + field_len = BN_num_bytes(&group->field); + enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; + + if (len != enc_len) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + return 0; + } + + if (ctx == NULL) + { + ctx = new_ctx = BN_CTX_new(); + if (ctx == NULL) + return 0; + } + + BN_CTX_start(ctx); + x = BN_CTX_get(ctx); + y = BN_CTX_get(ctx); + if (y == NULL) goto err; + + if (!BN_bin2bn(buf + 1, field_len, x)) goto err; + if (BN_ucmp(x, &group->field) >= 0) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + goto err; + } + + if (form == POINT_CONVERSION_COMPRESSED) + { + if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err; + } + else + { + if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err; + if (BN_ucmp(y, &group->field) >= 0) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + goto err; + } + if (form == POINT_CONVERSION_HYBRID) + { + if (y_bit != BN_is_odd(y)) + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); + goto err; + } + } + + if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err; + } + + if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */ + { + ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE); + goto err; + } + + ret = 1; + + err: + BN_CTX_end(ctx); + if (new_ctx != NULL) + BN_CTX_free(new_ctx); + return ret; + } + diff --git a/src/lib/libcrypto/ec/ecp_smpl.c b/src/lib/libcrypto/ec/ecp_smpl.c index 66a92e2a90..7cbb321f9a 100644 --- a/src/lib/libcrypto/ec/ecp_smpl.c +++ b/src/lib/libcrypto/ec/ecp_smpl.c @@ -65,11 +65,19 @@ #include #include +#ifdef OPENSSL_FIPS +#include +#endif + #include "ec_lcl.h" const EC_METHOD *EC_GFp_simple_method(void) { +#ifdef OPENSSL_FIPS + return fips_ec_gfp_simple_method(); +#else static const EC_METHOD ret = { + EC_FLAGS_DEFAULT_OCT, NID_X9_62_prime_field, ec_GFp_simple_group_init, ec_GFp_simple_group_finish, @@ -88,9 +96,7 @@ const EC_METHOD *EC_GFp_simple_method(void) ec_GFp_simple_get_Jprojective_coordinates_GFp, ec_GFp_simple_point_set_affine_coordinates, ec_GFp_simple_point_get_affine_coordinates, - ec_GFp_simple_set_compressed_coordinates, - ec_GFp_simple_point2oct, - ec_GFp_simple_oct2point, + 0,0,0, ec_GFp_simple_add, ec_GFp_simple_dbl, ec_GFp_simple_invert, @@ -110,6 +116,7 @@ const EC_METHOD *EC_GFp_simple_method(void) 0 /* field_set_to_one */ }; return &ret; +#endif } @@ -633,372 +640,6 @@ int ec_GFp_simple_point_get_affine_coordinates(const EC_GROUP *group, const EC_P return ret; } - -int ec_GFp_simple_set_compressed_coordinates(const EC_GROUP *group, EC_POINT *point, - const BIGNUM *x_, int y_bit, BN_CTX *ctx) - { - BN_CTX *new_ctx = NULL; - BIGNUM *tmp1, *tmp2, *x, *y; - int ret = 0; - - /* clear error queue*/ - ERR_clear_error(); - - if (ctx == NULL) - { - ctx = new_ctx = BN_CTX_new(); - if (ctx == NULL) - return 0; - } - - y_bit = (y_bit != 0); - - BN_CTX_start(ctx); - tmp1 = BN_CTX_get(ctx); - tmp2 = BN_CTX_get(ctx); - x = BN_CTX_get(ctx); - y = BN_CTX_get(ctx); - if (y == NULL) goto err; - - /* Recover y. We have a Weierstrass equation - * y^2 = x^3 + a*x + b, - * so y is one of the square roots of x^3 + a*x + b. - */ - - /* tmp1 := x^3 */ - if (!BN_nnmod(x, x_, &group->field,ctx)) goto err; - if (group->meth->field_decode == 0) - { - /* field_{sqr,mul} work on standard representation */ - if (!group->meth->field_sqr(group, tmp2, x_, ctx)) goto err; - if (!group->meth->field_mul(group, tmp1, tmp2, x_, ctx)) goto err; - } - else - { - if (!BN_mod_sqr(tmp2, x_, &group->field, ctx)) goto err; - if (!BN_mod_mul(tmp1, tmp2, x_, &group->field, ctx)) goto err; - } - - /* tmp1 := tmp1 + a*x */ - if (group->a_is_minus3) - { - if (!BN_mod_lshift1_quick(tmp2, x, &group->field)) goto err; - if (!BN_mod_add_quick(tmp2, tmp2, x, &group->field)) goto err; - if (!BN_mod_sub_quick(tmp1, tmp1, tmp2, &group->field)) goto err; - } - else - { - if (group->meth->field_decode) - { - if (!group->meth->field_decode(group, tmp2, &group->a, ctx)) goto err; - if (!BN_mod_mul(tmp2, tmp2, x, &group->field, ctx)) goto err; - } - else - { - /* field_mul works on standard representation */ - if (!group->meth->field_mul(group, tmp2, &group->a, x, ctx)) goto err; - } - - if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err; - } - - /* tmp1 := tmp1 + b */ - if (group->meth->field_decode) - { - if (!group->meth->field_decode(group, tmp2, &group->b, ctx)) goto err; - if (!BN_mod_add_quick(tmp1, tmp1, tmp2, &group->field)) goto err; - } - else - { - if (!BN_mod_add_quick(tmp1, tmp1, &group->b, &group->field)) goto err; - } - - if (!BN_mod_sqrt(y, tmp1, &group->field, ctx)) - { - unsigned long err = ERR_peek_last_error(); - - if (ERR_GET_LIB(err) == ERR_LIB_BN && ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE) - { - ERR_clear_error(); - ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT); - } - else - ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_BN_LIB); - goto err; - } - - if (y_bit != BN_is_odd(y)) - { - if (BN_is_zero(y)) - { - int kron; - - kron = BN_kronecker(x, &group->field, ctx); - if (kron == -2) goto err; - - if (kron == 1) - ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSION_BIT); - else - /* BN_mod_sqrt() should have cought this error (not a square) */ - ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, EC_R_INVALID_COMPRESSED_POINT); - goto err; - } - if (!BN_usub(y, &group->field, y)) goto err; - } - if (y_bit != BN_is_odd(y)) - { - ECerr(EC_F_EC_GFP_SIMPLE_SET_COMPRESSED_COORDINATES, ERR_R_INTERNAL_ERROR); - goto err; - } - - if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err; - - ret = 1; - - err: - BN_CTX_end(ctx); - if (new_ctx != NULL) - BN_CTX_free(new_ctx); - return ret; - } - - -size_t ec_GFp_simple_point2oct(const EC_GROUP *group, const EC_POINT *point, point_conversion_form_t form, - unsigned char *buf, size_t len, BN_CTX *ctx) - { - size_t ret; - BN_CTX *new_ctx = NULL; - int used_ctx = 0; - BIGNUM *x, *y; - size_t field_len, i, skip; - - if ((form != POINT_CONVERSION_COMPRESSED) - && (form != POINT_CONVERSION_UNCOMPRESSED) - && (form != POINT_CONVERSION_HYBRID)) - { - ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_INVALID_FORM); - goto err; - } - - if (EC_POINT_is_at_infinity(group, point)) - { - /* encodes to a single 0 octet */ - if (buf != NULL) - { - if (len < 1) - { - ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); - return 0; - } - buf[0] = 0; - } - return 1; - } - - - /* ret := required output buffer length */ - field_len = BN_num_bytes(&group->field); - ret = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; - - /* if 'buf' is NULL, just return required length */ - if (buf != NULL) - { - if (len < ret) - { - ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, EC_R_BUFFER_TOO_SMALL); - goto err; - } - - if (ctx == NULL) - { - ctx = new_ctx = BN_CTX_new(); - if (ctx == NULL) - return 0; - } - - BN_CTX_start(ctx); - used_ctx = 1; - x = BN_CTX_get(ctx); - y = BN_CTX_get(ctx); - if (y == NULL) goto err; - - if (!EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx)) goto err; - - if ((form == POINT_CONVERSION_COMPRESSED || form == POINT_CONVERSION_HYBRID) && BN_is_odd(y)) - buf[0] = form + 1; - else - buf[0] = form; - - i = 1; - - skip = field_len - BN_num_bytes(x); - if (skip > field_len) - { - ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); - goto err; - } - while (skip > 0) - { - buf[i++] = 0; - skip--; - } - skip = BN_bn2bin(x, buf + i); - i += skip; - if (i != 1 + field_len) - { - ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); - goto err; - } - - if (form == POINT_CONVERSION_UNCOMPRESSED || form == POINT_CONVERSION_HYBRID) - { - skip = field_len - BN_num_bytes(y); - if (skip > field_len) - { - ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); - goto err; - } - while (skip > 0) - { - buf[i++] = 0; - skip--; - } - skip = BN_bn2bin(y, buf + i); - i += skip; - } - - if (i != ret) - { - ECerr(EC_F_EC_GFP_SIMPLE_POINT2OCT, ERR_R_INTERNAL_ERROR); - goto err; - } - } - - if (used_ctx) - BN_CTX_end(ctx); - if (new_ctx != NULL) - BN_CTX_free(new_ctx); - return ret; - - err: - if (used_ctx) - BN_CTX_end(ctx); - if (new_ctx != NULL) - BN_CTX_free(new_ctx); - return 0; - } - - -int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point, - const unsigned char *buf, size_t len, BN_CTX *ctx) - { - point_conversion_form_t form; - int y_bit; - BN_CTX *new_ctx = NULL; - BIGNUM *x, *y; - size_t field_len, enc_len; - int ret = 0; - - if (len == 0) - { - ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_BUFFER_TOO_SMALL); - return 0; - } - form = buf[0]; - y_bit = form & 1; - form = form & ~1U; - if ((form != 0) && (form != POINT_CONVERSION_COMPRESSED) - && (form != POINT_CONVERSION_UNCOMPRESSED) - && (form != POINT_CONVERSION_HYBRID)) - { - ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); - return 0; - } - if ((form == 0 || form == POINT_CONVERSION_UNCOMPRESSED) && y_bit) - { - ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); - return 0; - } - - if (form == 0) - { - if (len != 1) - { - ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); - return 0; - } - - return EC_POINT_set_to_infinity(group, point); - } - - field_len = BN_num_bytes(&group->field); - enc_len = (form == POINT_CONVERSION_COMPRESSED) ? 1 + field_len : 1 + 2*field_len; - - if (len != enc_len) - { - ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); - return 0; - } - - if (ctx == NULL) - { - ctx = new_ctx = BN_CTX_new(); - if (ctx == NULL) - return 0; - } - - BN_CTX_start(ctx); - x = BN_CTX_get(ctx); - y = BN_CTX_get(ctx); - if (y == NULL) goto err; - - if (!BN_bin2bn(buf + 1, field_len, x)) goto err; - if (BN_ucmp(x, &group->field) >= 0) - { - ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); - goto err; - } - - if (form == POINT_CONVERSION_COMPRESSED) - { - if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) goto err; - } - else - { - if (!BN_bin2bn(buf + 1 + field_len, field_len, y)) goto err; - if (BN_ucmp(y, &group->field) >= 0) - { - ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); - goto err; - } - if (form == POINT_CONVERSION_HYBRID) - { - if (y_bit != BN_is_odd(y)) - { - ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_INVALID_ENCODING); - goto err; - } - } - - if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) goto err; - } - - if (!EC_POINT_is_on_curve(group, point, ctx)) /* test required by X9.62 */ - { - ECerr(EC_F_EC_GFP_SIMPLE_OCT2POINT, EC_R_POINT_IS_NOT_ON_CURVE); - goto err; - } - - ret = 1; - - err: - BN_CTX_end(ctx); - if (new_ctx != NULL) - BN_CTX_free(new_ctx); - return ret; - } - - int ec_GFp_simple_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, const EC_POINT *b, BN_CTX *ctx) { int (*field_mul)(const EC_GROUP *, BIGNUM *, const BIGNUM *, const BIGNUM *, BN_CTX *); diff --git a/src/lib/libcrypto/ecdh/ecdh.h b/src/lib/libcrypto/ecdh/ecdh.h index b4b58ee65b..8887102c0b 100644 --- a/src/lib/libcrypto/ecdh/ecdh.h +++ b/src/lib/libcrypto/ecdh/ecdh.h @@ -109,11 +109,13 @@ void ERR_load_ECDH_strings(void); /* Error codes for the ECDH functions. */ /* Function codes. */ +#define ECDH_F_ECDH_CHECK 102 #define ECDH_F_ECDH_COMPUTE_KEY 100 #define ECDH_F_ECDH_DATA_NEW_METHOD 101 /* Reason codes. */ #define ECDH_R_KDF_FAILED 102 +#define ECDH_R_NON_FIPS_METHOD 103 #define ECDH_R_NO_PRIVATE_VALUE 100 #define ECDH_R_POINT_ARITHMETIC_FAILURE 101 diff --git a/src/lib/libcrypto/ecdh/ech_err.c b/src/lib/libcrypto/ecdh/ech_err.c index 6f4b0c9953..3bd247398d 100644 --- a/src/lib/libcrypto/ecdh/ech_err.c +++ b/src/lib/libcrypto/ecdh/ech_err.c @@ -1,6 +1,6 @@ /* crypto/ecdh/ech_err.c */ /* ==================================================================== - * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -70,6 +70,7 @@ static ERR_STRING_DATA ECDH_str_functs[]= { +{ERR_FUNC(ECDH_F_ECDH_CHECK), "ECDH_CHECK"}, {ERR_FUNC(ECDH_F_ECDH_COMPUTE_KEY), "ECDH_compute_key"}, {ERR_FUNC(ECDH_F_ECDH_DATA_NEW_METHOD), "ECDH_DATA_new_method"}, {0,NULL} @@ -78,6 +79,7 @@ static ERR_STRING_DATA ECDH_str_functs[]= static ERR_STRING_DATA ECDH_str_reasons[]= { {ERR_REASON(ECDH_R_KDF_FAILED) ,"KDF failed"}, +{ERR_REASON(ECDH_R_NON_FIPS_METHOD) ,"non fips method"}, {ERR_REASON(ECDH_R_NO_PRIVATE_VALUE) ,"no private value"}, {ERR_REASON(ECDH_R_POINT_ARITHMETIC_FAILURE),"point arithmetic failure"}, {0,NULL} diff --git a/src/lib/libcrypto/ecdh/ech_lib.c b/src/lib/libcrypto/ecdh/ech_lib.c index 4d8ea03d3d..dadbfd3c49 100644 --- a/src/lib/libcrypto/ecdh/ech_lib.c +++ b/src/lib/libcrypto/ecdh/ech_lib.c @@ -73,6 +73,9 @@ #include #endif #include +#ifdef OPENSSL_FIPS +#include +#endif const char ECDH_version[]="ECDH" OPENSSL_VERSION_PTEXT; @@ -90,7 +93,16 @@ void ECDH_set_default_method(const ECDH_METHOD *meth) const ECDH_METHOD *ECDH_get_default_method(void) { if(!default_ECDH_method) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + return FIPS_ecdh_openssl(); + else + return ECDH_OpenSSL(); +#else default_ECDH_method = ECDH_OpenSSL(); +#endif + } return default_ECDH_method; } @@ -215,6 +227,14 @@ ECDH_DATA *ecdh_check(EC_KEY *key) } else ecdh_data = (ECDH_DATA *)data; +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(ecdh_data->flags & ECDH_FLAG_FIPS_METHOD) + && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW)) + { + ECDHerr(ECDH_F_ECDH_CHECK, ECDH_R_NON_FIPS_METHOD); + return NULL; + } +#endif return ecdh_data; diff --git a/src/lib/libcrypto/ecdh/ech_locl.h b/src/lib/libcrypto/ecdh/ech_locl.h index f658526a7e..f6cad6a894 100644 --- a/src/lib/libcrypto/ecdh/ech_locl.h +++ b/src/lib/libcrypto/ecdh/ech_locl.h @@ -75,6 +75,14 @@ struct ecdh_method char *app_data; }; +/* If this flag is set the ECDH method is FIPS compliant and can be used + * in FIPS mode. This is set in the validated module method. If an + * application sets this flag in its own methods it is its responsibility + * to ensure the result is compliant. + */ + +#define ECDH_FLAG_FIPS_METHOD 0x1 + typedef struct ecdh_data_st { /* EC_KEY_METH_DATA part */ int (*init)(EC_KEY *); diff --git a/src/lib/libcrypto/ecdsa/ecdsa.h b/src/lib/libcrypto/ecdsa/ecdsa.h index e61c539812..7fb5254b62 100644 --- a/src/lib/libcrypto/ecdsa/ecdsa.h +++ b/src/lib/libcrypto/ecdsa/ecdsa.h @@ -238,6 +238,7 @@ void ERR_load_ECDSA_strings(void); /* Error codes for the ECDSA functions. */ /* Function codes. */ +#define ECDSA_F_ECDSA_CHECK 104 #define ECDSA_F_ECDSA_DATA_NEW_METHOD 100 #define ECDSA_F_ECDSA_DO_SIGN 101 #define ECDSA_F_ECDSA_DO_VERIFY 102 @@ -249,6 +250,7 @@ void ERR_load_ECDSA_strings(void); #define ECDSA_R_ERR_EC_LIB 102 #define ECDSA_R_MISSING_PARAMETERS 103 #define ECDSA_R_NEED_NEW_SETUP_VALUES 106 +#define ECDSA_R_NON_FIPS_METHOD 107 #define ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED 104 #define ECDSA_R_SIGNATURE_MALLOC_FAILED 105 diff --git a/src/lib/libcrypto/ecdsa/ecs_err.c b/src/lib/libcrypto/ecdsa/ecs_err.c index 98e38d537f..81542e6d15 100644 --- a/src/lib/libcrypto/ecdsa/ecs_err.c +++ b/src/lib/libcrypto/ecdsa/ecs_err.c @@ -1,6 +1,6 @@ /* crypto/ecdsa/ecs_err.c */ /* ==================================================================== - * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -70,6 +70,7 @@ static ERR_STRING_DATA ECDSA_str_functs[]= { +{ERR_FUNC(ECDSA_F_ECDSA_CHECK), "ECDSA_CHECK"}, {ERR_FUNC(ECDSA_F_ECDSA_DATA_NEW_METHOD), "ECDSA_DATA_NEW_METHOD"}, {ERR_FUNC(ECDSA_F_ECDSA_DO_SIGN), "ECDSA_do_sign"}, {ERR_FUNC(ECDSA_F_ECDSA_DO_VERIFY), "ECDSA_do_verify"}, @@ -84,6 +85,7 @@ static ERR_STRING_DATA ECDSA_str_reasons[]= {ERR_REASON(ECDSA_R_ERR_EC_LIB) ,"err ec lib"}, {ERR_REASON(ECDSA_R_MISSING_PARAMETERS) ,"missing parameters"}, {ERR_REASON(ECDSA_R_NEED_NEW_SETUP_VALUES),"need new setup values"}, +{ERR_REASON(ECDSA_R_NON_FIPS_METHOD) ,"non fips method"}, {ERR_REASON(ECDSA_R_RANDOM_NUMBER_GENERATION_FAILED),"random number generation failed"}, {ERR_REASON(ECDSA_R_SIGNATURE_MALLOC_FAILED),"signature malloc failed"}, {0,NULL} diff --git a/src/lib/libcrypto/ecdsa/ecs_lib.c b/src/lib/libcrypto/ecdsa/ecs_lib.c index 2ebae3aa27..e477da430b 100644 --- a/src/lib/libcrypto/ecdsa/ecs_lib.c +++ b/src/lib/libcrypto/ecdsa/ecs_lib.c @@ -60,6 +60,9 @@ #endif #include #include +#ifdef OPENSSL_FIPS +#include +#endif const char ECDSA_version[]="ECDSA" OPENSSL_VERSION_PTEXT; @@ -77,7 +80,16 @@ void ECDSA_set_default_method(const ECDSA_METHOD *meth) const ECDSA_METHOD *ECDSA_get_default_method(void) { if(!default_ECDSA_method) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + return FIPS_ecdsa_openssl(); + else + return ECDSA_OpenSSL(); +#else default_ECDSA_method = ECDSA_OpenSSL(); +#endif + } return default_ECDSA_method; } @@ -193,7 +205,14 @@ ECDSA_DATA *ecdsa_check(EC_KEY *key) } else ecdsa_data = (ECDSA_DATA *)data; - +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(ecdsa_data->flags & ECDSA_FLAG_FIPS_METHOD) + && !(EC_KEY_get_flags(key) & EC_FLAG_NON_FIPS_ALLOW)) + { + ECDSAerr(ECDSA_F_ECDSA_CHECK, ECDSA_R_NON_FIPS_METHOD); + return NULL; + } +#endif return ecdsa_data; } diff --git a/src/lib/libcrypto/ecdsa/ecs_locl.h b/src/lib/libcrypto/ecdsa/ecs_locl.h index 3a69a840e2..cb3be13cfc 100644 --- a/src/lib/libcrypto/ecdsa/ecs_locl.h +++ b/src/lib/libcrypto/ecdsa/ecs_locl.h @@ -82,6 +82,14 @@ struct ecdsa_method char *app_data; }; +/* If this flag is set the ECDSA method is FIPS compliant and can be used + * in FIPS mode. This is set in the validated module method. If an + * application sets this flag in its own methods it is its responsibility + * to ensure the result is compliant. + */ + +#define ECDSA_FLAG_FIPS_METHOD 0x1 + typedef struct ecdsa_data_st { /* EC_KEY_METH_DATA part */ int (*init)(EC_KEY *); diff --git a/src/lib/libcrypto/ecdsa/ecs_ossl.c b/src/lib/libcrypto/ecdsa/ecs_ossl.c index 1bbf328de5..7725935610 100644 --- a/src/lib/libcrypto/ecdsa/ecs_ossl.c +++ b/src/lib/libcrypto/ecdsa/ecs_ossl.c @@ -167,6 +167,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp, goto err; } } +#ifndef OPENSSL_NO_EC2M else /* NID_X9_62_characteristic_two_field */ { if (!EC_POINT_get_affine_coordinates_GF2m(group, @@ -176,6 +177,7 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in, BIGNUM **kinvp, goto err; } } +#endif if (!BN_nnmod(r, X, order, ctx)) { ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB); @@ -454,6 +456,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len, goto err; } } +#ifndef OPENSSL_NO_EC2M else /* NID_X9_62_characteristic_two_field */ { if (!EC_POINT_get_affine_coordinates_GF2m(group, @@ -463,7 +466,7 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len, goto err; } } - +#endif if (!BN_nnmod(u1, X, order, ctx)) { ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB); diff --git a/src/lib/libcrypto/engine/eng_all.c b/src/lib/libcrypto/engine/eng_all.c index 22c120454f..6093376df4 100644 --- a/src/lib/libcrypto/engine/eng_all.c +++ b/src/lib/libcrypto/engine/eng_all.c @@ -61,6 +61,8 @@ void ENGINE_load_builtin_engines(void) { + /* Some ENGINEs need this */ + OPENSSL_cpuid_setup(); #if 0 /* There's no longer any need for an "openssl" ENGINE unless, one day, * it is the *only* way for standard builtin implementations to be be @@ -70,6 +72,12 @@ void ENGINE_load_builtin_engines(void) #endif #if !defined(OPENSSL_NO_HW) && (defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV)) ENGINE_load_cryptodev(); +#endif +#ifndef OPENSSL_NO_RSAX + ENGINE_load_rsax(); +#endif +#ifndef OPENSSL_NO_RDRAND + ENGINE_load_rdrand(); #endif ENGINE_load_dynamic(); #ifndef OPENSSL_NO_STATIC_ENGINE @@ -112,6 +120,7 @@ void ENGINE_load_builtin_engines(void) ENGINE_load_capi(); #endif #endif + ENGINE_register_all_complete(); } #if defined(__OpenBSD__) || defined(__FreeBSD__) || defined(HAVE_CRYPTODEV) diff --git a/src/lib/libcrypto/engine/eng_fat.c b/src/lib/libcrypto/engine/eng_fat.c index db66e62350..789b8d57e5 100644 --- a/src/lib/libcrypto/engine/eng_fat.c +++ b/src/lib/libcrypto/engine/eng_fat.c @@ -176,6 +176,7 @@ int ENGINE_register_all_complete(void) ENGINE *e; for(e=ENGINE_get_first() ; e ; e=ENGINE_get_next(e)) - ENGINE_register_complete(e); + if (!(e->flags & ENGINE_FLAGS_NO_REGISTER_ALL)) + ENGINE_register_complete(e); return 1; } diff --git a/src/lib/libcrypto/engine/engine.h b/src/lib/libcrypto/engine/engine.h index 943aeae215..f8be497724 100644 --- a/src/lib/libcrypto/engine/engine.h +++ b/src/lib/libcrypto/engine/engine.h @@ -141,6 +141,13 @@ extern "C" { * the existing ENGINE's structural reference count. */ #define ENGINE_FLAGS_BY_ID_COPY (int)0x0004 +/* This flag if for an ENGINE that does not want its methods registered as + * part of ENGINE_register_all_complete() for example if the methods are + * not usable as default methods. + */ + +#define ENGINE_FLAGS_NO_REGISTER_ALL (int)0x0008 + /* ENGINEs can support their own command types, and these flags are used in * ENGINE_CTRL_GET_CMD_FLAGS to indicate to the caller what kind of input each * command expects. Currently only numeric and string input is supported. If a @@ -344,6 +351,8 @@ void ENGINE_load_gost(void); #endif #endif void ENGINE_load_cryptodev(void); +void ENGINE_load_rsax(void); +void ENGINE_load_rdrand(void); void ENGINE_load_builtin_engines(void); /* Get and set global flags (ENGINE_TABLE_FLAG_***) for the implementation diff --git a/src/lib/libcrypto/err/err.c b/src/lib/libcrypto/err/err.c index 69713a6e2f..fcdb244008 100644 --- a/src/lib/libcrypto/err/err.c +++ b/src/lib/libcrypto/err/err.c @@ -1066,6 +1066,13 @@ void ERR_set_error_data(char *data, int flags) void ERR_add_error_data(int num, ...) { va_list args; + va_start(args, num); + ERR_add_error_vdata(num, args); + va_end(args); + } + +void ERR_add_error_vdata(int num, va_list args) + { int i,n,s; char *str,*p,*a; @@ -1074,7 +1081,6 @@ void ERR_add_error_data(int num, ...) if (str == NULL) return; str[0]='\0'; - va_start(args, num); n=0; for (i=0; i +#ifdef OPENSSL_FIPS +#include +#endif + void ERR_load_crypto_strings(void) { #ifndef OPENSSL_NO_ERR @@ -156,5 +160,8 @@ void ERR_load_crypto_strings(void) ERR_load_JPAKE_strings(); #endif ERR_load_COMP_strings(); +#endif +#ifdef OPENSSL_FIPS + ERR_load_FIPS_strings(); #endif } diff --git a/src/lib/libcrypto/evp/bio_md.c b/src/lib/libcrypto/evp/bio_md.c index 9841e32e1a..144fdfd56a 100644 --- a/src/lib/libcrypto/evp/bio_md.c +++ b/src/lib/libcrypto/evp/bio_md.c @@ -153,8 +153,12 @@ static int md_write(BIO *b, const char *in, int inl) { if (ret > 0) { - EVP_DigestUpdate(ctx,(const unsigned char *)in, - (unsigned int)ret); + if (!EVP_DigestUpdate(ctx,(const unsigned char *)in, + (unsigned int)ret)) + { + BIO_clear_retry_flags(b); + return 0; + } } } if(b->next_bio != NULL) @@ -220,7 +224,8 @@ static long md_ctrl(BIO *b, int cmd, long num, void *ptr) case BIO_CTRL_DUP: dbio=ptr; dctx=dbio->ptr; - EVP_MD_CTX_copy_ex(dctx,ctx); + if (!EVP_MD_CTX_copy_ex(dctx,ctx)) + return 0; b->init=1; break; default: diff --git a/src/lib/libcrypto/evp/digest.c b/src/lib/libcrypto/evp/digest.c index 982ba2b136..467e6b5ae9 100644 --- a/src/lib/libcrypto/evp/digest.c +++ b/src/lib/libcrypto/evp/digest.c @@ -117,6 +117,10 @@ #include #endif +#ifdef OPENSSL_FIPS +#include +#endif + void EVP_MD_CTX_init(EVP_MD_CTX *ctx) { memset(ctx,'\0',sizeof *ctx); @@ -225,12 +229,26 @@ skip_to_init: } if (ctx->flags & EVP_MD_CTX_FLAG_NO_INIT) return 1; +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + { + if (FIPS_digestinit(ctx, type)) + return 1; + OPENSSL_free(ctx->md_data); + ctx->md_data = NULL; + return 0; + } +#endif return ctx->digest->init(ctx); } int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *data, size_t count) { +#ifdef OPENSSL_FIPS + return FIPS_digestupdate(ctx, data, count); +#else return ctx->update(ctx,data,count); +#endif } /* The caller can assume that this removes any secret data from the context */ @@ -245,8 +263,10 @@ int EVP_DigestFinal(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size) /* The caller can assume that this removes any secret data from the context */ int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size) { +#ifdef OPENSSL_FIPS + return FIPS_digestfinal(ctx, md, size); +#else int ret; - OPENSSL_assert(ctx->digest->md_size <= EVP_MAX_MD_SIZE); ret=ctx->digest->final(ctx,md); if (size != NULL) @@ -258,6 +278,7 @@ int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, unsigned char *md, unsigned int *size) } memset(ctx->md_data,0,ctx->digest->ctx_size); return ret; +#endif } int EVP_MD_CTX_copy(EVP_MD_CTX *out, const EVP_MD_CTX *in) @@ -351,6 +372,7 @@ void EVP_MD_CTX_destroy(EVP_MD_CTX *ctx) /* This call frees resources associated with the context */ int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx) { +#ifndef OPENSSL_FIPS /* Don't assume ctx->md_data was cleaned in EVP_Digest_Final, * because sometimes only copies of the context are ever finalised. */ @@ -363,6 +385,7 @@ int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx) OPENSSL_cleanse(ctx->md_data,ctx->digest->ctx_size); OPENSSL_free(ctx->md_data); } +#endif if (ctx->pctx) EVP_PKEY_CTX_free(ctx->pctx); #ifndef OPENSSL_NO_ENGINE @@ -370,6 +393,9 @@ int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx) /* The EVP_MD we used belongs to an ENGINE, release the * functional reference we held for this reason. */ ENGINE_finish(ctx->engine); +#endif +#ifdef OPENSSL_FIPS + FIPS_md_ctx_cleanup(ctx); #endif memset(ctx,'\0',sizeof *ctx); diff --git a/src/lib/libcrypto/evp/e_aes.c b/src/lib/libcrypto/evp/e_aes.c index bd6c0a3a62..1e4af0cb75 100644 --- a/src/lib/libcrypto/evp/e_aes.c +++ b/src/lib/libcrypto/evp/e_aes.c @@ -1,5 +1,5 @@ /* ==================================================================== - * Copyright (c) 2001 The OpenSSL Project. All rights reserved. + * Copyright (c) 2001-2011 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -56,57 +56,511 @@ #include #include #include "evp_locl.h" - -static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, - const unsigned char *iv, int enc); +#ifndef OPENSSL_FIPS +#include "modes_lcl.h" +#include typedef struct { AES_KEY ks; + block128_f block; + union { + cbc128_f cbc; + ctr128_f ctr; + } stream; } EVP_AES_KEY; -#define data(ctx) EVP_C_DATA(EVP_AES_KEY,ctx) - -IMPLEMENT_BLOCK_CIPHER(aes_128, ks, AES, EVP_AES_KEY, - NID_aes_128, 16, 16, 16, 128, - 0, aes_init_key, NULL, - EVP_CIPHER_set_asn1_iv, - EVP_CIPHER_get_asn1_iv, - NULL) -IMPLEMENT_BLOCK_CIPHER(aes_192, ks, AES, EVP_AES_KEY, - NID_aes_192, 16, 24, 16, 128, - 0, aes_init_key, NULL, - EVP_CIPHER_set_asn1_iv, - EVP_CIPHER_get_asn1_iv, - NULL) -IMPLEMENT_BLOCK_CIPHER(aes_256, ks, AES, EVP_AES_KEY, - NID_aes_256, 16, 32, 16, 128, - 0, aes_init_key, NULL, - EVP_CIPHER_set_asn1_iv, - EVP_CIPHER_get_asn1_iv, - NULL) - -#define IMPLEMENT_AES_CFBR(ksize,cbits) IMPLEMENT_CFBR(aes,AES,EVP_AES_KEY,ks,ksize,cbits,16) - -IMPLEMENT_AES_CFBR(128,1) -IMPLEMENT_AES_CFBR(192,1) -IMPLEMENT_AES_CFBR(256,1) - -IMPLEMENT_AES_CFBR(128,8) -IMPLEMENT_AES_CFBR(192,8) -IMPLEMENT_AES_CFBR(256,8) +typedef struct + { + AES_KEY ks; /* AES key schedule to use */ + int key_set; /* Set if key initialised */ + int iv_set; /* Set if an iv is set */ + GCM128_CONTEXT gcm; + unsigned char *iv; /* Temporary IV store */ + int ivlen; /* IV length */ + int taglen; + int iv_gen; /* It is OK to generate IVs */ + int tls_aad_len; /* TLS AAD length */ + ctr128_f ctr; + } EVP_AES_GCM_CTX; + +typedef struct + { + AES_KEY ks1, ks2; /* AES key schedules to use */ + XTS128_CONTEXT xts; + void (*stream)(const unsigned char *in, + unsigned char *out, size_t length, + const AES_KEY *key1, const AES_KEY *key2, + const unsigned char iv[16]); + } EVP_AES_XTS_CTX; + +typedef struct + { + AES_KEY ks; /* AES key schedule to use */ + int key_set; /* Set if key initialised */ + int iv_set; /* Set if an iv is set */ + int tag_set; /* Set if tag is valid */ + int len_set; /* Set if message length set */ + int L, M; /* L and M parameters from RFC3610 */ + CCM128_CONTEXT ccm; + ccm128_f str; + } EVP_AES_CCM_CTX; + +#define MAXBITCHUNK ((size_t)1<<(sizeof(size_t)*8-4)) + +#ifdef VPAES_ASM +int vpaes_set_encrypt_key(const unsigned char *userKey, int bits, + AES_KEY *key); +int vpaes_set_decrypt_key(const unsigned char *userKey, int bits, + AES_KEY *key); + +void vpaes_encrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key); +void vpaes_decrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key); + +void vpaes_cbc_encrypt(const unsigned char *in, + unsigned char *out, + size_t length, + const AES_KEY *key, + unsigned char *ivec, int enc); +#endif +#ifdef BSAES_ASM +void bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const AES_KEY *key, + unsigned char ivec[16], int enc); +void bsaes_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const AES_KEY *key, + const unsigned char ivec[16]); +void bsaes_xts_encrypt(const unsigned char *inp, unsigned char *out, + size_t len, const AES_KEY *key1, + const AES_KEY *key2, const unsigned char iv[16]); +void bsaes_xts_decrypt(const unsigned char *inp, unsigned char *out, + size_t len, const AES_KEY *key1, + const AES_KEY *key2, const unsigned char iv[16]); +#endif +#ifdef AES_CTR_ASM +void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out, + size_t blocks, const AES_KEY *key, + const unsigned char ivec[AES_BLOCK_SIZE]); +#endif +#ifdef AES_XTS_ASM +void AES_xts_encrypt(const char *inp,char *out,size_t len, + const AES_KEY *key1, const AES_KEY *key2, + const unsigned char iv[16]); +void AES_xts_decrypt(const char *inp,char *out,size_t len, + const AES_KEY *key1, const AES_KEY *key2, + const unsigned char iv[16]); +#endif + +#if defined(AES_ASM) && !defined(I386_ONLY) && ( \ + ((defined(__i386) || defined(__i386__) || \ + defined(_M_IX86)) && defined(OPENSSL_IA32_SSE2))|| \ + defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_AMD64) || defined(_M_X64) || \ + defined(__INTEL__) ) + +extern unsigned int OPENSSL_ia32cap_P[2]; + +#ifdef VPAES_ASM +#define VPAES_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(41-32))) +#endif +#ifdef BSAES_ASM +#define BSAES_CAPABLE VPAES_CAPABLE +#endif +/* + * AES-NI section + */ +#define AESNI_CAPABLE (OPENSSL_ia32cap_P[1]&(1<<(57-32))) + +int aesni_set_encrypt_key(const unsigned char *userKey, int bits, + AES_KEY *key); +int aesni_set_decrypt_key(const unsigned char *userKey, int bits, + AES_KEY *key); + +void aesni_encrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key); +void aesni_decrypt(const unsigned char *in, unsigned char *out, + const AES_KEY *key); + +void aesni_ecb_encrypt(const unsigned char *in, + unsigned char *out, + size_t length, + const AES_KEY *key, + int enc); +void aesni_cbc_encrypt(const unsigned char *in, + unsigned char *out, + size_t length, + const AES_KEY *key, + unsigned char *ivec, int enc); + +void aesni_ctr32_encrypt_blocks(const unsigned char *in, + unsigned char *out, + size_t blocks, + const void *key, + const unsigned char *ivec); + +void aesni_xts_encrypt(const unsigned char *in, + unsigned char *out, + size_t length, + const AES_KEY *key1, const AES_KEY *key2, + const unsigned char iv[16]); + +void aesni_xts_decrypt(const unsigned char *in, + unsigned char *out, + size_t length, + const AES_KEY *key1, const AES_KEY *key2, + const unsigned char iv[16]); + +void aesni_ccm64_encrypt_blocks (const unsigned char *in, + unsigned char *out, + size_t blocks, + const void *key, + const unsigned char ivec[16], + unsigned char cmac[16]); + +void aesni_ccm64_decrypt_blocks (const unsigned char *in, + unsigned char *out, + size_t blocks, + const void *key, + const unsigned char ivec[16], + unsigned char cmac[16]); + +static int aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + const unsigned char *iv, int enc) + { + int ret, mode; + EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; + + mode = ctx->cipher->flags & EVP_CIPH_MODE; + if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) + && !enc) + { + ret = aesni_set_decrypt_key(key, ctx->key_len*8, ctx->cipher_data); + dat->block = (block128_f)aesni_decrypt; + dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? + (cbc128_f)aesni_cbc_encrypt : + NULL; + } + else { + ret = aesni_set_encrypt_key(key, ctx->key_len*8, ctx->cipher_data); + dat->block = (block128_f)aesni_encrypt; + if (mode==EVP_CIPH_CBC_MODE) + dat->stream.cbc = (cbc128_f)aesni_cbc_encrypt; + else if (mode==EVP_CIPH_CTR_MODE) + dat->stream.ctr = (ctr128_f)aesni_ctr32_encrypt_blocks; + else + dat->stream.cbc = NULL; + } + + if(ret < 0) + { + EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED); + return 0; + } + + return 1; + } + +static int aesni_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, + const unsigned char *in, size_t len) +{ + aesni_cbc_encrypt(in,out,len,ctx->cipher_data,ctx->iv,ctx->encrypt); + + return 1; +} + +static int aesni_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, + const unsigned char *in, size_t len) +{ + size_t bl = ctx->cipher->block_size; + + if (lencipher_data,ctx->encrypt); + + return 1; +} + +#define aesni_ofb_cipher aes_ofb_cipher +static int aesni_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, + const unsigned char *in,size_t len); + +#define aesni_cfb_cipher aes_cfb_cipher +static int aesni_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, + const unsigned char *in,size_t len); + +#define aesni_cfb8_cipher aes_cfb8_cipher +static int aesni_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, + const unsigned char *in,size_t len); + +#define aesni_cfb1_cipher aes_cfb1_cipher +static int aesni_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, + const unsigned char *in,size_t len); + +#define aesni_ctr_cipher aes_ctr_cipher +static int aesni_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len); + +static int aesni_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + const unsigned char *iv, int enc) + { + EVP_AES_GCM_CTX *gctx = ctx->cipher_data; + if (!iv && !key) + return 1; + if (key) + { + aesni_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks); + CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, + (block128_f)aesni_encrypt); + gctx->ctr = (ctr128_f)aesni_ctr32_encrypt_blocks; + /* If we have an iv can set it directly, otherwise use + * saved IV. + */ + if (iv == NULL && gctx->iv_set) + iv = gctx->iv; + if (iv) + { + CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen); + gctx->iv_set = 1; + } + gctx->key_set = 1; + } + else + { + /* If key set use IV, otherwise copy */ + if (gctx->key_set) + CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen); + else + memcpy(gctx->iv, iv, gctx->ivlen); + gctx->iv_set = 1; + gctx->iv_gen = 0; + } + return 1; + } + +#define aesni_gcm_cipher aes_gcm_cipher +static int aesni_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len); + +static int aesni_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + const unsigned char *iv, int enc) + { + EVP_AES_XTS_CTX *xctx = ctx->cipher_data; + if (!iv && !key) + return 1; + + if (key) + { + /* key_len is two AES keys */ + if (enc) + { + aesni_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); + xctx->xts.block1 = (block128_f)aesni_encrypt; + xctx->stream = aesni_xts_encrypt; + } + else + { + aesni_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); + xctx->xts.block1 = (block128_f)aesni_decrypt; + xctx->stream = aesni_xts_decrypt; + } + + aesni_set_encrypt_key(key + ctx->key_len/2, + ctx->key_len * 4, &xctx->ks2); + xctx->xts.block2 = (block128_f)aesni_encrypt; + + xctx->xts.key1 = &xctx->ks1; + } + + if (iv) + { + xctx->xts.key2 = &xctx->ks2; + memcpy(ctx->iv, iv, 16); + } + + return 1; + } + +#define aesni_xts_cipher aes_xts_cipher +static int aesni_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len); + +static int aesni_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + const unsigned char *iv, int enc) + { + EVP_AES_CCM_CTX *cctx = ctx->cipher_data; + if (!iv && !key) + return 1; + if (key) + { + aesni_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks); + CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, + &cctx->ks, (block128_f)aesni_encrypt); + cctx->str = enc?(ccm128_f)aesni_ccm64_encrypt_blocks : + (ccm128_f)aesni_ccm64_decrypt_blocks; + cctx->key_set = 1; + } + if (iv) + { + memcpy(ctx->iv, iv, 15 - cctx->L); + cctx->iv_set = 1; + } + return 1; + } + +#define aesni_ccm_cipher aes_ccm_cipher +static int aesni_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len); + +#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \ +static const EVP_CIPHER aesni_##keylen##_##mode = { \ + nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \ + flags|EVP_CIPH_##MODE##_MODE, \ + aesni_init_key, \ + aesni_##mode##_cipher, \ + NULL, \ + sizeof(EVP_AES_KEY), \ + NULL,NULL,NULL,NULL }; \ +static const EVP_CIPHER aes_##keylen##_##mode = { \ + nid##_##keylen##_##nmode,blocksize, \ + keylen/8,ivlen, \ + flags|EVP_CIPH_##MODE##_MODE, \ + aes_init_key, \ + aes_##mode##_cipher, \ + NULL, \ + sizeof(EVP_AES_KEY), \ + NULL,NULL,NULL,NULL }; \ +const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \ +{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; } + +#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \ +static const EVP_CIPHER aesni_##keylen##_##mode = { \ + nid##_##keylen##_##mode,blocksize, \ + (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \ + flags|EVP_CIPH_##MODE##_MODE, \ + aesni_##mode##_init_key, \ + aesni_##mode##_cipher, \ + aes_##mode##_cleanup, \ + sizeof(EVP_AES_##MODE##_CTX), \ + NULL,NULL,aes_##mode##_ctrl,NULL }; \ +static const EVP_CIPHER aes_##keylen##_##mode = { \ + nid##_##keylen##_##mode,blocksize, \ + (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \ + flags|EVP_CIPH_##MODE##_MODE, \ + aes_##mode##_init_key, \ + aes_##mode##_cipher, \ + aes_##mode##_cleanup, \ + sizeof(EVP_AES_##MODE##_CTX), \ + NULL,NULL,aes_##mode##_ctrl,NULL }; \ +const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \ +{ return AESNI_CAPABLE?&aesni_##keylen##_##mode:&aes_##keylen##_##mode; } + +#else + +#define BLOCK_CIPHER_generic(nid,keylen,blocksize,ivlen,nmode,mode,MODE,flags) \ +static const EVP_CIPHER aes_##keylen##_##mode = { \ + nid##_##keylen##_##nmode,blocksize,keylen/8,ivlen, \ + flags|EVP_CIPH_##MODE##_MODE, \ + aes_init_key, \ + aes_##mode##_cipher, \ + NULL, \ + sizeof(EVP_AES_KEY), \ + NULL,NULL,NULL,NULL }; \ +const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \ +{ return &aes_##keylen##_##mode; } + +#define BLOCK_CIPHER_custom(nid,keylen,blocksize,ivlen,mode,MODE,flags) \ +static const EVP_CIPHER aes_##keylen##_##mode = { \ + nid##_##keylen##_##mode,blocksize, \ + (EVP_CIPH_##MODE##_MODE==EVP_CIPH_XTS_MODE?2:1)*keylen/8, ivlen, \ + flags|EVP_CIPH_##MODE##_MODE, \ + aes_##mode##_init_key, \ + aes_##mode##_cipher, \ + aes_##mode##_cleanup, \ + sizeof(EVP_AES_##MODE##_CTX), \ + NULL,NULL,aes_##mode##_ctrl,NULL }; \ +const EVP_CIPHER *EVP_aes_##keylen##_##mode(void) \ +{ return &aes_##keylen##_##mode; } +#endif + +#define BLOCK_CIPHER_generic_pack(nid,keylen,flags) \ + BLOCK_CIPHER_generic(nid,keylen,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,keylen,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,keylen,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,keylen,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ + BLOCK_CIPHER_generic(nid,keylen,1,16,cfb1,cfb1,CFB,flags) \ + BLOCK_CIPHER_generic(nid,keylen,1,16,cfb8,cfb8,CFB,flags) \ + BLOCK_CIPHER_generic(nid,keylen,1,16,ctr,ctr,CTR,flags) static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, const unsigned char *iv, int enc) { - int ret; + int ret, mode; + EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; - if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE - || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE - || enc) - ret=AES_set_encrypt_key(key, ctx->key_len * 8, ctx->cipher_data); + mode = ctx->cipher->flags & EVP_CIPH_MODE; + if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) + && !enc) +#ifdef BSAES_CAPABLE + if (BSAES_CAPABLE && mode==EVP_CIPH_CBC_MODE) + { + ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks); + dat->block = (block128_f)AES_decrypt; + dat->stream.cbc = (cbc128_f)bsaes_cbc_encrypt; + } + else +#endif +#ifdef VPAES_CAPABLE + if (VPAES_CAPABLE) + { + ret = vpaes_set_decrypt_key(key,ctx->key_len*8,&dat->ks); + dat->block = (block128_f)vpaes_decrypt; + dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? + (cbc128_f)vpaes_cbc_encrypt : + NULL; + } + else +#endif + { + ret = AES_set_decrypt_key(key,ctx->key_len*8,&dat->ks); + dat->block = (block128_f)AES_decrypt; + dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? + (cbc128_f)AES_cbc_encrypt : + NULL; + } else - ret=AES_set_decrypt_key(key, ctx->key_len * 8, ctx->cipher_data); +#ifdef BSAES_CAPABLE + if (BSAES_CAPABLE && mode==EVP_CIPH_CTR_MODE) + { + ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks); + dat->block = (block128_f)AES_encrypt; + dat->stream.ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks; + } + else +#endif +#ifdef VPAES_CAPABLE + if (VPAES_CAPABLE) + { + ret = vpaes_set_encrypt_key(key,ctx->key_len*8,&dat->ks); + dat->block = (block128_f)vpaes_encrypt; + dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? + (cbc128_f)vpaes_cbc_encrypt : + NULL; + } + else +#endif + { + ret = AES_set_encrypt_key(key,ctx->key_len*8,&dat->ks); + dat->block = (block128_f)AES_encrypt; + dat->stream.cbc = mode==EVP_CIPH_CBC_MODE ? + (cbc128_f)AES_cbc_encrypt : + NULL; +#ifdef AES_CTR_ASM + if (mode==EVP_CIPH_CTR_MODE) + dat->stream.ctr = (ctr128_f)AES_ctr32_encrypt; +#endif + } if(ret < 0) { @@ -117,4 +571,743 @@ static int aes_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, return 1; } +static int aes_cbc_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, + const unsigned char *in, size_t len) +{ + EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; + + if (dat->stream.cbc) + (*dat->stream.cbc)(in,out,len,&dat->ks,ctx->iv,ctx->encrypt); + else if (ctx->encrypt) + CRYPTO_cbc128_encrypt(in,out,len,&dat->ks,ctx->iv,dat->block); + else + CRYPTO_cbc128_encrypt(in,out,len,&dat->ks,ctx->iv,dat->block); + + return 1; +} + +static int aes_ecb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, + const unsigned char *in, size_t len) +{ + size_t bl = ctx->cipher->block_size; + size_t i; + EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; + + if (lenblock)(in+i,out+i,&dat->ks); + + return 1; +} + +static int aes_ofb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, + const unsigned char *in,size_t len) +{ + EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; + + CRYPTO_ofb128_encrypt(in,out,len,&dat->ks, + ctx->iv,&ctx->num,dat->block); + return 1; +} + +static int aes_cfb_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, + const unsigned char *in,size_t len) +{ + EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; + + CRYPTO_cfb128_encrypt(in,out,len,&dat->ks, + ctx->iv,&ctx->num,ctx->encrypt,dat->block); + return 1; +} + +static int aes_cfb8_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, + const unsigned char *in,size_t len) +{ + EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; + + CRYPTO_cfb128_8_encrypt(in,out,len,&dat->ks, + ctx->iv,&ctx->num,ctx->encrypt,dat->block); + return 1; +} + +static int aes_cfb1_cipher(EVP_CIPHER_CTX *ctx,unsigned char *out, + const unsigned char *in,size_t len) +{ + EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; + + if (ctx->flags&EVP_CIPH_FLAG_LENGTH_BITS) { + CRYPTO_cfb128_1_encrypt(in,out,len,&dat->ks, + ctx->iv,&ctx->num,ctx->encrypt,dat->block); + return 1; + } + + while (len>=MAXBITCHUNK) { + CRYPTO_cfb128_1_encrypt(in,out,MAXBITCHUNK*8,&dat->ks, + ctx->iv,&ctx->num,ctx->encrypt,dat->block); + len-=MAXBITCHUNK; + } + if (len) + CRYPTO_cfb128_1_encrypt(in,out,len*8,&dat->ks, + ctx->iv,&ctx->num,ctx->encrypt,dat->block); + + return 1; +} + +static int aes_ctr_cipher (EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) +{ + unsigned int num = ctx->num; + EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; + + if (dat->stream.ctr) + CRYPTO_ctr128_encrypt_ctr32(in,out,len,&dat->ks, + ctx->iv,ctx->buf,&num,dat->stream.ctr); + else + CRYPTO_ctr128_encrypt(in,out,len,&dat->ks, + ctx->iv,ctx->buf,&num,dat->block); + ctx->num = (size_t)num; + return 1; +} + +BLOCK_CIPHER_generic_pack(NID_aes,128,EVP_CIPH_FLAG_FIPS) +BLOCK_CIPHER_generic_pack(NID_aes,192,EVP_CIPH_FLAG_FIPS) +BLOCK_CIPHER_generic_pack(NID_aes,256,EVP_CIPH_FLAG_FIPS) + +static int aes_gcm_cleanup(EVP_CIPHER_CTX *c) + { + EVP_AES_GCM_CTX *gctx = c->cipher_data; + OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm)); + if (gctx->iv != c->iv) + OPENSSL_free(gctx->iv); + return 1; + } + +/* increment counter (64-bit int) by 1 */ +static void ctr64_inc(unsigned char *counter) { + int n=8; + unsigned char c; + + do { + --n; + c = counter[n]; + ++c; + counter[n] = c; + if (c) return; + } while (n); +} + +static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) + { + EVP_AES_GCM_CTX *gctx = c->cipher_data; + switch (type) + { + case EVP_CTRL_INIT: + gctx->key_set = 0; + gctx->iv_set = 0; + gctx->ivlen = c->cipher->iv_len; + gctx->iv = c->iv; + gctx->taglen = -1; + gctx->iv_gen = 0; + gctx->tls_aad_len = -1; + return 1; + + case EVP_CTRL_GCM_SET_IVLEN: + if (arg <= 0) + return 0; +#ifdef OPENSSL_FIPS + if (FIPS_module_mode() && !(c->flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW) + && arg < 12) + return 0; +#endif + /* Allocate memory for IV if needed */ + if ((arg > EVP_MAX_IV_LENGTH) && (arg > gctx->ivlen)) + { + if (gctx->iv != c->iv) + OPENSSL_free(gctx->iv); + gctx->iv = OPENSSL_malloc(arg); + if (!gctx->iv) + return 0; + } + gctx->ivlen = arg; + return 1; + + case EVP_CTRL_GCM_SET_TAG: + if (arg <= 0 || arg > 16 || c->encrypt) + return 0; + memcpy(c->buf, ptr, arg); + gctx->taglen = arg; + return 1; + + case EVP_CTRL_GCM_GET_TAG: + if (arg <= 0 || arg > 16 || !c->encrypt || gctx->taglen < 0) + return 0; + memcpy(ptr, c->buf, arg); + return 1; + + case EVP_CTRL_GCM_SET_IV_FIXED: + /* Special case: -1 length restores whole IV */ + if (arg == -1) + { + memcpy(gctx->iv, ptr, gctx->ivlen); + gctx->iv_gen = 1; + return 1; + } + /* Fixed field must be at least 4 bytes and invocation field + * at least 8. + */ + if ((arg < 4) || (gctx->ivlen - arg) < 8) + return 0; + if (arg) + memcpy(gctx->iv, ptr, arg); + if (c->encrypt && + RAND_bytes(gctx->iv + arg, gctx->ivlen - arg) <= 0) + return 0; + gctx->iv_gen = 1; + return 1; + + case EVP_CTRL_GCM_IV_GEN: + if (gctx->iv_gen == 0 || gctx->key_set == 0) + return 0; + CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen); + if (arg <= 0 || arg > gctx->ivlen) + arg = gctx->ivlen; + memcpy(ptr, gctx->iv + gctx->ivlen - arg, arg); + /* Invocation field will be at least 8 bytes in size and + * so no need to check wrap around or increment more than + * last 8 bytes. + */ + ctr64_inc(gctx->iv + gctx->ivlen - 8); + gctx->iv_set = 1; + return 1; + + case EVP_CTRL_GCM_SET_IV_INV: + if (gctx->iv_gen == 0 || gctx->key_set == 0 || c->encrypt) + return 0; + memcpy(gctx->iv + gctx->ivlen - arg, ptr, arg); + CRYPTO_gcm128_setiv(&gctx->gcm, gctx->iv, gctx->ivlen); + gctx->iv_set = 1; + return 1; + + case EVP_CTRL_AEAD_TLS1_AAD: + /* Save the AAD for later use */ + if (arg != 13) + return 0; + memcpy(c->buf, ptr, arg); + gctx->tls_aad_len = arg; + { + unsigned int len=c->buf[arg-2]<<8|c->buf[arg-1]; + /* Correct length for explicit IV */ + len -= EVP_GCM_TLS_EXPLICIT_IV_LEN; + /* If decrypting correct for tag too */ + if (!c->encrypt) + len -= EVP_GCM_TLS_TAG_LEN; + c->buf[arg-2] = len>>8; + c->buf[arg-1] = len & 0xff; + } + /* Extra padding: tag appended to record */ + return EVP_GCM_TLS_TAG_LEN; + + default: + return -1; + + } + } + +static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + const unsigned char *iv, int enc) + { + EVP_AES_GCM_CTX *gctx = ctx->cipher_data; + if (!iv && !key) + return 1; + if (key) + { do { +#ifdef BSAES_CAPABLE + if (BSAES_CAPABLE) + { + AES_set_encrypt_key(key,ctx->key_len*8,&gctx->ks); + CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks, + (block128_f)AES_encrypt); + gctx->ctr = (ctr128_f)bsaes_ctr32_encrypt_blocks; + break; + } + else +#endif +#ifdef VPAES_CAPABLE + if (VPAES_CAPABLE) + { + vpaes_set_encrypt_key(key,ctx->key_len*8,&gctx->ks); + CRYPTO_gcm128_init(&gctx->gcm,&gctx->ks, + (block128_f)vpaes_encrypt); + gctx->ctr = NULL; + break; + } +#endif + AES_set_encrypt_key(key, ctx->key_len * 8, &gctx->ks); + CRYPTO_gcm128_init(&gctx->gcm, &gctx->ks, (block128_f)AES_encrypt); +#ifdef AES_CTR_ASM + gctx->ctr = (ctr128_f)AES_ctr32_encrypt; +#else + gctx->ctr = NULL; +#endif + } while (0); + + /* If we have an iv can set it directly, otherwise use + * saved IV. + */ + if (iv == NULL && gctx->iv_set) + iv = gctx->iv; + if (iv) + { + CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen); + gctx->iv_set = 1; + } + gctx->key_set = 1; + } + else + { + /* If key set use IV, otherwise copy */ + if (gctx->key_set) + CRYPTO_gcm128_setiv(&gctx->gcm, iv, gctx->ivlen); + else + memcpy(gctx->iv, iv, gctx->ivlen); + gctx->iv_set = 1; + gctx->iv_gen = 0; + } + return 1; + } + +/* Handle TLS GCM packet format. This consists of the last portion of the IV + * followed by the payload and finally the tag. On encrypt generate IV, + * encrypt payload and write the tag. On verify retrieve IV, decrypt payload + * and verify tag. + */ + +static int aes_gcm_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) + { + EVP_AES_GCM_CTX *gctx = ctx->cipher_data; + int rv = -1; + /* Encrypt/decrypt must be performed in place */ + if (out != in || len < (EVP_GCM_TLS_EXPLICIT_IV_LEN+EVP_GCM_TLS_TAG_LEN)) + return -1; + /* Set IV from start of buffer or generate IV and write to start + * of buffer. + */ + if (EVP_CIPHER_CTX_ctrl(ctx, ctx->encrypt ? + EVP_CTRL_GCM_IV_GEN : EVP_CTRL_GCM_SET_IV_INV, + EVP_GCM_TLS_EXPLICIT_IV_LEN, out) <= 0) + goto err; + /* Use saved AAD */ + if (CRYPTO_gcm128_aad(&gctx->gcm, ctx->buf, gctx->tls_aad_len)) + goto err; + /* Fix buffer and length to point to payload */ + in += EVP_GCM_TLS_EXPLICIT_IV_LEN; + out += EVP_GCM_TLS_EXPLICIT_IV_LEN; + len -= EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN; + if (ctx->encrypt) + { + /* Encrypt payload */ + if (gctx->ctr) + { + if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm, + in, out, len, + gctx->ctr)) + goto err; + } + else { + if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len)) + goto err; + } + out += len; + /* Finally write tag */ + CRYPTO_gcm128_tag(&gctx->gcm, out, EVP_GCM_TLS_TAG_LEN); + rv = len + EVP_GCM_TLS_EXPLICIT_IV_LEN + EVP_GCM_TLS_TAG_LEN; + } + else + { + /* Decrypt */ + if (gctx->ctr) + { + if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm, + in, out, len, + gctx->ctr)) + goto err; + } + else { + if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len)) + goto err; + } + /* Retrieve tag */ + CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf, + EVP_GCM_TLS_TAG_LEN); + /* If tag mismatch wipe buffer */ + if (memcmp(ctx->buf, in + len, EVP_GCM_TLS_TAG_LEN)) + { + OPENSSL_cleanse(out, len); + goto err; + } + rv = len; + } + + err: + gctx->iv_set = 0; + gctx->tls_aad_len = -1; + return rv; + } + +static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) + { + EVP_AES_GCM_CTX *gctx = ctx->cipher_data; + /* If not set up, return error */ + if (!gctx->key_set) + return -1; + + if (gctx->tls_aad_len >= 0) + return aes_gcm_tls_cipher(ctx, out, in, len); + + if (!gctx->iv_set) + return -1; + if (!ctx->encrypt && gctx->taglen < 0) + return -1; + if (in) + { + if (out == NULL) + { + if (CRYPTO_gcm128_aad(&gctx->gcm, in, len)) + return -1; + } + else if (ctx->encrypt) + { + if (gctx->ctr) + { + if (CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm, + in, out, len, + gctx->ctr)) + return -1; + } + else { + if (CRYPTO_gcm128_encrypt(&gctx->gcm, in, out, len)) + return -1; + } + } + else + { + if (gctx->ctr) + { + if (CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm, + in, out, len, + gctx->ctr)) + return -1; + } + else { + if (CRYPTO_gcm128_decrypt(&gctx->gcm, in, out, len)) + return -1; + } + } + return len; + } + else + { + if (!ctx->encrypt) + { + if (CRYPTO_gcm128_finish(&gctx->gcm, + ctx->buf, gctx->taglen) != 0) + return -1; + gctx->iv_set = 0; + return 0; + } + CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf, 16); + gctx->taglen = 16; + /* Don't reuse the IV */ + gctx->iv_set = 0; + return 0; + } + + } + +#define CUSTOM_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 \ + | EVP_CIPH_CUSTOM_IV | EVP_CIPH_FLAG_CUSTOM_CIPHER \ + | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT) + +BLOCK_CIPHER_custom(NID_aes,128,1,12,gcm,GCM, + EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS) +BLOCK_CIPHER_custom(NID_aes,192,1,12,gcm,GCM, + EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS) +BLOCK_CIPHER_custom(NID_aes,256,1,12,gcm,GCM, + EVP_CIPH_FLAG_FIPS|EVP_CIPH_FLAG_AEAD_CIPHER|CUSTOM_FLAGS) + +static int aes_xts_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) + { + EVP_AES_XTS_CTX *xctx = c->cipher_data; + if (type != EVP_CTRL_INIT) + return -1; + /* key1 and key2 are used as an indicator both key and IV are set */ + xctx->xts.key1 = NULL; + xctx->xts.key2 = NULL; + return 1; + } + +static int aes_xts_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + const unsigned char *iv, int enc) + { + EVP_AES_XTS_CTX *xctx = ctx->cipher_data; + if (!iv && !key) + return 1; + + if (key) do + { +#ifdef AES_XTS_ASM + xctx->stream = enc ? AES_xts_encrypt : AES_xts_decrypt; +#else + xctx->stream = NULL; +#endif + /* key_len is two AES keys */ +#ifdef BSAES_CAPABLE + if (BSAES_CAPABLE) + xctx->stream = enc ? bsaes_xts_encrypt : bsaes_xts_decrypt; + else +#endif +#ifdef VPAES_CAPABLE + if (VPAES_CAPABLE) + { + if (enc) + { + vpaes_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); + xctx->xts.block1 = (block128_f)vpaes_encrypt; + } + else + { + vpaes_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); + xctx->xts.block1 = (block128_f)vpaes_decrypt; + } + + vpaes_set_encrypt_key(key + ctx->key_len/2, + ctx->key_len * 4, &xctx->ks2); + xctx->xts.block2 = (block128_f)vpaes_encrypt; + + xctx->xts.key1 = &xctx->ks1; + break; + } +#endif + if (enc) + { + AES_set_encrypt_key(key, ctx->key_len * 4, &xctx->ks1); + xctx->xts.block1 = (block128_f)AES_encrypt; + } + else + { + AES_set_decrypt_key(key, ctx->key_len * 4, &xctx->ks1); + xctx->xts.block1 = (block128_f)AES_decrypt; + } + + AES_set_encrypt_key(key + ctx->key_len/2, + ctx->key_len * 4, &xctx->ks2); + xctx->xts.block2 = (block128_f)AES_encrypt; + + xctx->xts.key1 = &xctx->ks1; + } while (0); + + if (iv) + { + xctx->xts.key2 = &xctx->ks2; + memcpy(ctx->iv, iv, 16); + } + + return 1; + } + +static int aes_xts_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) + { + EVP_AES_XTS_CTX *xctx = ctx->cipher_data; + if (!xctx->xts.key1 || !xctx->xts.key2) + return 0; + if (!out || !in || lenflags & EVP_CIPH_FLAG_NON_FIPS_ALLOW) && + (len > (1UL<<20)*16)) + { + EVPerr(EVP_F_AES_XTS_CIPHER, EVP_R_TOO_LARGE); + return 0; + } +#endif + if (xctx->stream) + (*xctx->stream)(in, out, len, + xctx->xts.key1, xctx->xts.key2, ctx->iv); + else if (CRYPTO_xts128_encrypt(&xctx->xts, ctx->iv, in, out, len, + ctx->encrypt)) + return 0; + return 1; + } + +#define aes_xts_cleanup NULL + +#define XTS_FLAGS (EVP_CIPH_FLAG_DEFAULT_ASN1 | EVP_CIPH_CUSTOM_IV \ + | EVP_CIPH_ALWAYS_CALL_INIT | EVP_CIPH_CTRL_INIT) + +BLOCK_CIPHER_custom(NID_aes,128,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS) +BLOCK_CIPHER_custom(NID_aes,256,1,16,xts,XTS,EVP_CIPH_FLAG_FIPS|XTS_FLAGS) + +static int aes_ccm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) + { + EVP_AES_CCM_CTX *cctx = c->cipher_data; + switch (type) + { + case EVP_CTRL_INIT: + cctx->key_set = 0; + cctx->iv_set = 0; + cctx->L = 8; + cctx->M = 12; + cctx->tag_set = 0; + cctx->len_set = 0; + return 1; + + case EVP_CTRL_CCM_SET_IVLEN: + arg = 15 - arg; + case EVP_CTRL_CCM_SET_L: + if (arg < 2 || arg > 8) + return 0; + cctx->L = arg; + return 1; + + case EVP_CTRL_CCM_SET_TAG: + if ((arg & 1) || arg < 4 || arg > 16) + return 0; + if ((c->encrypt && ptr) || (!c->encrypt && !ptr)) + return 0; + if (ptr) + { + cctx->tag_set = 1; + memcpy(c->buf, ptr, arg); + } + cctx->M = arg; + return 1; + + case EVP_CTRL_CCM_GET_TAG: + if (!c->encrypt || !cctx->tag_set) + return 0; + if(!CRYPTO_ccm128_tag(&cctx->ccm, ptr, (size_t)arg)) + return 0; + cctx->tag_set = 0; + cctx->iv_set = 0; + cctx->len_set = 0; + return 1; + + default: + return -1; + + } + } + +static int aes_ccm_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + const unsigned char *iv, int enc) + { + EVP_AES_CCM_CTX *cctx = ctx->cipher_data; + if (!iv && !key) + return 1; + if (key) do + { +#ifdef VPAES_CAPABLE + if (VPAES_CAPABLE) + { + vpaes_set_encrypt_key(key, ctx->key_len*8, &cctx->ks); + CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, + &cctx->ks, (block128_f)vpaes_encrypt); + cctx->key_set = 1; + break; + } +#endif + AES_set_encrypt_key(key, ctx->key_len * 8, &cctx->ks); + CRYPTO_ccm128_init(&cctx->ccm, cctx->M, cctx->L, + &cctx->ks, (block128_f)AES_encrypt); + cctx->str = NULL; + cctx->key_set = 1; + } while (0); + if (iv) + { + memcpy(ctx->iv, iv, 15 - cctx->L); + cctx->iv_set = 1; + } + return 1; + } + +static int aes_ccm_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) + { + EVP_AES_CCM_CTX *cctx = ctx->cipher_data; + CCM128_CONTEXT *ccm = &cctx->ccm; + /* If not set up, return error */ + if (!cctx->iv_set && !cctx->key_set) + return -1; + if (!ctx->encrypt && !cctx->tag_set) + return -1; + if (!out) + { + if (!in) + { + if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L,len)) + return -1; + cctx->len_set = 1; + return len; + } + /* If have AAD need message length */ + if (!cctx->len_set && len) + return -1; + CRYPTO_ccm128_aad(ccm, in, len); + return len; + } + /* EVP_*Final() doesn't return any data */ + if (!in) + return 0; + /* If not set length yet do it */ + if (!cctx->len_set) + { + if (CRYPTO_ccm128_setiv(ccm, ctx->iv, 15 - cctx->L, len)) + return -1; + cctx->len_set = 1; + } + if (ctx->encrypt) + { + if (cctx->str ? CRYPTO_ccm128_encrypt_ccm64(ccm, in, out, len, + cctx->str) : + CRYPTO_ccm128_encrypt(ccm, in, out, len)) + return -1; + cctx->tag_set = 1; + return len; + } + else + { + int rv = -1; + if (cctx->str ? !CRYPTO_ccm128_decrypt_ccm64(ccm, in, out, len, + cctx->str) : + !CRYPTO_ccm128_decrypt(ccm, in, out, len)) + { + unsigned char tag[16]; + if (CRYPTO_ccm128_tag(ccm, tag, cctx->M)) + { + if (!memcmp(tag, ctx->buf, cctx->M)) + rv = len; + } + } + if (rv == -1) + OPENSSL_cleanse(out, len); + cctx->iv_set = 0; + cctx->tag_set = 0; + cctx->len_set = 0; + return rv; + } + + } + +#define aes_ccm_cleanup NULL + +BLOCK_CIPHER_custom(NID_aes,128,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS) +BLOCK_CIPHER_custom(NID_aes,192,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS) +BLOCK_CIPHER_custom(NID_aes,256,1,12,ccm,CCM,EVP_CIPH_FLAG_FIPS|CUSTOM_FLAGS) + +#endif #endif diff --git a/src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c b/src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c new file mode 100644 index 0000000000..710fb79baf --- /dev/null +++ b/src/lib/libcrypto/evp/e_aes_cbc_hmac_sha1.c @@ -0,0 +1,406 @@ +/* ==================================================================== + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include + +#include +#include + +#if !defined(OPENSSL_NO_AES) && !defined(OPENSSL_NO_SHA1) + +#include +#include +#include +#include +#include "evp_locl.h" + +#ifndef EVP_CIPH_FLAG_AEAD_CIPHER +#define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000 +#define EVP_CTRL_AEAD_TLS1_AAD 0x16 +#define EVP_CTRL_AEAD_SET_MAC_KEY 0x17 +#endif + +#if !defined(EVP_CIPH_FLAG_DEFAULT_ASN1) +#define EVP_CIPH_FLAG_DEFAULT_ASN1 0 +#endif + +#define TLS1_1_VERSION 0x0302 + +typedef struct + { + AES_KEY ks; + SHA_CTX head,tail,md; + size_t payload_length; /* AAD length in decrypt case */ + union { + unsigned int tls_ver; + unsigned char tls_aad[16]; /* 13 used */ + } aux; + } EVP_AES_HMAC_SHA1; + +#define NO_PAYLOAD_LENGTH ((size_t)-1) + +#if defined(AES_ASM) && ( \ + defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_AMD64) || defined(_M_X64) || \ + defined(__INTEL__) ) + +extern unsigned int OPENSSL_ia32cap_P[2]; +#define AESNI_CAPABLE (1<<(57-32)) + +int aesni_set_encrypt_key(const unsigned char *userKey, int bits, + AES_KEY *key); +int aesni_set_decrypt_key(const unsigned char *userKey, int bits, + AES_KEY *key); + +void aesni_cbc_encrypt(const unsigned char *in, + unsigned char *out, + size_t length, + const AES_KEY *key, + unsigned char *ivec, int enc); + +void aesni_cbc_sha1_enc (const void *inp, void *out, size_t blocks, + const AES_KEY *key, unsigned char iv[16], + SHA_CTX *ctx,const void *in0); + +#define data(ctx) ((EVP_AES_HMAC_SHA1 *)(ctx)->cipher_data) + +static int aesni_cbc_hmac_sha1_init_key(EVP_CIPHER_CTX *ctx, + const unsigned char *inkey, + const unsigned char *iv, int enc) + { + EVP_AES_HMAC_SHA1 *key = data(ctx); + int ret; + + if (enc) + ret=aesni_set_encrypt_key(inkey,ctx->key_len*8,&key->ks); + else + ret=aesni_set_decrypt_key(inkey,ctx->key_len*8,&key->ks); + + SHA1_Init(&key->head); /* handy when benchmarking */ + key->tail = key->head; + key->md = key->head; + + key->payload_length = NO_PAYLOAD_LENGTH; + + return ret<0?0:1; + } + +#define STITCHED_CALL + +#if !defined(STITCHED_CALL) +#define aes_off 0 +#endif + +void sha1_block_data_order (void *c,const void *p,size_t len); + +static void sha1_update(SHA_CTX *c,const void *data,size_t len) +{ const unsigned char *ptr = data; + size_t res; + + if ((res = c->num)) { + res = SHA_CBLOCK-res; + if (lenNh += len>>29; + c->Nl += len<<=3; + if (c->Nl<(unsigned int)len) c->Nh++; + } + + if (res) + SHA1_Update(c,ptr,res); +} + +#define SHA1_Update sha1_update + +static int aesni_cbc_hmac_sha1_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) + { + EVP_AES_HMAC_SHA1 *key = data(ctx); + unsigned int l; + size_t plen = key->payload_length, + iv = 0, /* explicit IV in TLS 1.1 and later */ + sha_off = 0; +#if defined(STITCHED_CALL) + size_t aes_off = 0, + blocks; + + sha_off = SHA_CBLOCK-key->md.num; +#endif + + if (len%AES_BLOCK_SIZE) return 0; + + if (ctx->encrypt) { + if (plen==NO_PAYLOAD_LENGTH) + plen = len; + else if (len!=((plen+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE)) + return 0; + else if (key->aux.tls_ver >= TLS1_1_VERSION) + iv = AES_BLOCK_SIZE; + +#if defined(STITCHED_CALL) + if (plen>(sha_off+iv) && (blocks=(plen-(sha_off+iv))/SHA_CBLOCK)) { + SHA1_Update(&key->md,in+iv,sha_off); + + aesni_cbc_sha1_enc(in,out,blocks,&key->ks, + ctx->iv,&key->md,in+iv+sha_off); + blocks *= SHA_CBLOCK; + aes_off += blocks; + sha_off += blocks; + key->md.Nh += blocks>>29; + key->md.Nl += blocks<<=3; + if (key->md.Nl<(unsigned int)blocks) key->md.Nh++; + } else { + sha_off = 0; + } +#endif + sha_off += iv; + SHA1_Update(&key->md,in+sha_off,plen-sha_off); + + if (plen!=len) { /* "TLS" mode of operation */ + if (in!=out) + memcpy(out+aes_off,in+aes_off,plen-aes_off); + + /* calculate HMAC and append it to payload */ + SHA1_Final(out+plen,&key->md); + key->md = key->tail; + SHA1_Update(&key->md,out+plen,SHA_DIGEST_LENGTH); + SHA1_Final(out+plen,&key->md); + + /* pad the payload|hmac */ + plen += SHA_DIGEST_LENGTH; + for (l=len-plen-1;plenks,ctx->iv,1); + } else { + aesni_cbc_encrypt(in+aes_off,out+aes_off,len-aes_off, + &key->ks,ctx->iv,1); + } + } else { + unsigned char mac[SHA_DIGEST_LENGTH]; + + /* decrypt HMAC|padding at once */ + aesni_cbc_encrypt(in,out,len, + &key->ks,ctx->iv,0); + + if (plen) { /* "TLS" mode of operation */ + /* figure out payload length */ + if (len<(size_t)(out[len-1]+1+SHA_DIGEST_LENGTH)) + return 0; + + len -= (out[len-1]+1+SHA_DIGEST_LENGTH); + + if ((key->aux.tls_aad[plen-4]<<8|key->aux.tls_aad[plen-3]) + >= TLS1_1_VERSION) { + len -= AES_BLOCK_SIZE; + iv = AES_BLOCK_SIZE; + } + + key->aux.tls_aad[plen-2] = len>>8; + key->aux.tls_aad[plen-1] = len; + + /* calculate HMAC and verify it */ + key->md = key->head; + SHA1_Update(&key->md,key->aux.tls_aad,plen); + SHA1_Update(&key->md,out+iv,len); + SHA1_Final(mac,&key->md); + + key->md = key->tail; + SHA1_Update(&key->md,mac,SHA_DIGEST_LENGTH); + SHA1_Final(mac,&key->md); + + if (memcmp(out+iv+len,mac,SHA_DIGEST_LENGTH)) + return 0; + } else { + SHA1_Update(&key->md,out,len); + } + } + + key->payload_length = NO_PAYLOAD_LENGTH; + + return 1; + } + +static int aesni_cbc_hmac_sha1_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr) + { + EVP_AES_HMAC_SHA1 *key = data(ctx); + + switch (type) + { + case EVP_CTRL_AEAD_SET_MAC_KEY: + { + unsigned int i; + unsigned char hmac_key[64]; + + memset (hmac_key,0,sizeof(hmac_key)); + + if (arg > (int)sizeof(hmac_key)) { + SHA1_Init(&key->head); + SHA1_Update(&key->head,ptr,arg); + SHA1_Final(hmac_key,&key->head); + } else { + memcpy(hmac_key,ptr,arg); + } + + for (i=0;ihead); + SHA1_Update(&key->head,hmac_key,sizeof(hmac_key)); + + for (i=0;itail); + SHA1_Update(&key->tail,hmac_key,sizeof(hmac_key)); + + return 1; + } + case EVP_CTRL_AEAD_TLS1_AAD: + { + unsigned char *p=ptr; + unsigned int len=p[arg-2]<<8|p[arg-1]; + + if (ctx->encrypt) + { + key->payload_length = len; + if ((key->aux.tls_ver=p[arg-4]<<8|p[arg-3]) >= TLS1_1_VERSION) { + len -= AES_BLOCK_SIZE; + p[arg-2] = len>>8; + p[arg-1] = len; + } + key->md = key->head; + SHA1_Update(&key->md,p,arg); + + return (int)(((len+SHA_DIGEST_LENGTH+AES_BLOCK_SIZE)&-AES_BLOCK_SIZE) + - len); + } + else + { + if (arg>13) arg = 13; + memcpy(key->aux.tls_aad,ptr,arg); + key->payload_length = arg; + + return SHA_DIGEST_LENGTH; + } + } + default: + return -1; + } + } + +static EVP_CIPHER aesni_128_cbc_hmac_sha1_cipher = + { +#ifdef NID_aes_128_cbc_hmac_sha1 + NID_aes_128_cbc_hmac_sha1, +#else + NID_undef, +#endif + 16,16,16, + EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER, + aesni_cbc_hmac_sha1_init_key, + aesni_cbc_hmac_sha1_cipher, + NULL, + sizeof(EVP_AES_HMAC_SHA1), + EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv, + EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv, + aesni_cbc_hmac_sha1_ctrl, + NULL + }; + +static EVP_CIPHER aesni_256_cbc_hmac_sha1_cipher = + { +#ifdef NID_aes_256_cbc_hmac_sha1 + NID_aes_256_cbc_hmac_sha1, +#else + NID_undef, +#endif + 16,32,16, + EVP_CIPH_CBC_MODE|EVP_CIPH_FLAG_DEFAULT_ASN1|EVP_CIPH_FLAG_AEAD_CIPHER, + aesni_cbc_hmac_sha1_init_key, + aesni_cbc_hmac_sha1_cipher, + NULL, + sizeof(EVP_AES_HMAC_SHA1), + EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_set_asn1_iv, + EVP_CIPH_FLAG_DEFAULT_ASN1?NULL:EVP_CIPHER_get_asn1_iv, + aesni_cbc_hmac_sha1_ctrl, + NULL + }; + +const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void) + { + return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE? + &aesni_128_cbc_hmac_sha1_cipher:NULL); + } + +const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void) + { + return(OPENSSL_ia32cap_P[1]&AESNI_CAPABLE? + &aesni_256_cbc_hmac_sha1_cipher:NULL); + } +#else +const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void) + { + return NULL; + } +const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void) + { + return NULL; + } +#endif +#endif diff --git a/src/lib/libcrypto/evp/e_des3.c b/src/lib/libcrypto/evp/e_des3.c index 3232cfe024..1e69972662 100644 --- a/src/lib/libcrypto/evp/e_des3.c +++ b/src/lib/libcrypto/evp/e_des3.c @@ -65,6 +65,8 @@ #include #include +#ifndef OPENSSL_FIPS + static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, const unsigned char *iv,int enc); @@ -311,3 +313,4 @@ const EVP_CIPHER *EVP_des_ede3(void) return &des_ede3_ecb; } #endif +#endif diff --git a/src/lib/libcrypto/evp/e_null.c b/src/lib/libcrypto/evp/e_null.c index 7cf50e1416..f0c1f78b5f 100644 --- a/src/lib/libcrypto/evp/e_null.c +++ b/src/lib/libcrypto/evp/e_null.c @@ -61,6 +61,8 @@ #include #include +#ifndef OPENSSL_FIPS + static int null_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, const unsigned char *iv,int enc); static int null_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, @@ -99,4 +101,4 @@ static int null_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, memcpy((char *)out,(const char *)in,inl); return 1; } - +#endif diff --git a/src/lib/libcrypto/evp/e_rc2.c b/src/lib/libcrypto/evp/e_rc2.c index f78d781129..d4c33b58d4 100644 --- a/src/lib/libcrypto/evp/e_rc2.c +++ b/src/lib/libcrypto/evp/e_rc2.c @@ -183,7 +183,8 @@ static int rc2_get_asn1_type_and_iv(EVP_CIPHER_CTX *c, ASN1_TYPE *type) key_bits =rc2_magic_to_meth((int)num); if (!key_bits) return(-1); - if(i > 0) EVP_CipherInit_ex(c, NULL, NULL, NULL, iv, -1); + if(i > 0 && !EVP_CipherInit_ex(c, NULL, NULL, NULL, iv, -1)) + return -1; EVP_CIPHER_CTX_ctrl(c, EVP_CTRL_SET_RC2_KEY_BITS, key_bits, NULL); EVP_CIPHER_CTX_set_key_length(c, key_bits / 8); } diff --git a/src/lib/libcrypto/evp/e_rc4.c b/src/lib/libcrypto/evp/e_rc4.c index 8b5175e0fd..b4f6bda82d 100644 --- a/src/lib/libcrypto/evp/e_rc4.c +++ b/src/lib/libcrypto/evp/e_rc4.c @@ -62,6 +62,7 @@ #ifndef OPENSSL_NO_RC4 #include +#include "evp_locl.h" #include #include diff --git a/src/lib/libcrypto/evp/e_rc4_hmac_md5.c b/src/lib/libcrypto/evp/e_rc4_hmac_md5.c new file mode 100644 index 0000000000..56563191ba --- /dev/null +++ b/src/lib/libcrypto/evp/e_rc4_hmac_md5.c @@ -0,0 +1,298 @@ +/* ==================================================================== + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include + +#include +#include + +#if !defined(OPENSSL_NO_RC4) && !defined(OPENSSL_NO_MD5) + +#include +#include +#include +#include + +#ifndef EVP_CIPH_FLAG_AEAD_CIPHER +#define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000 +#define EVP_CTRL_AEAD_TLS1_AAD 0x16 +#define EVP_CTRL_AEAD_SET_MAC_KEY 0x17 +#endif + +/* FIXME: surely this is available elsewhere? */ +#define EVP_RC4_KEY_SIZE 16 + +typedef struct + { + RC4_KEY ks; + MD5_CTX head,tail,md; + size_t payload_length; + } EVP_RC4_HMAC_MD5; + +#define NO_PAYLOAD_LENGTH ((size_t)-1) + +void rc4_md5_enc (RC4_KEY *key, const void *in0, void *out, + MD5_CTX *ctx,const void *inp,size_t blocks); + +#define data(ctx) ((EVP_RC4_HMAC_MD5 *)(ctx)->cipher_data) + +static int rc4_hmac_md5_init_key(EVP_CIPHER_CTX *ctx, + const unsigned char *inkey, + const unsigned char *iv, int enc) + { + EVP_RC4_HMAC_MD5 *key = data(ctx); + + RC4_set_key(&key->ks,EVP_CIPHER_CTX_key_length(ctx), + inkey); + + MD5_Init(&key->head); /* handy when benchmarking */ + key->tail = key->head; + key->md = key->head; + + key->payload_length = NO_PAYLOAD_LENGTH; + + return 1; + } + +#if !defined(OPENSSL_NO_ASM) && ( \ + defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_AMD64) || defined(_M_X64) || \ + defined(__INTEL__) ) && \ + !(defined(__APPLE__) && defined(__MACH__)) +#define STITCHED_CALL +#endif + +#if !defined(STITCHED_CALL) +#define rc4_off 0 +#define md5_off 0 +#endif + +static int rc4_hmac_md5_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) + { + EVP_RC4_HMAC_MD5 *key = data(ctx); +#if defined(STITCHED_CALL) + size_t rc4_off = 32-1-(key->ks.x&(32-1)), /* 32 is $MOD from rc4_md5-x86_64.pl */ + md5_off = MD5_CBLOCK-key->md.num, + blocks; + unsigned int l; + extern unsigned int OPENSSL_ia32cap_P[]; +#endif + size_t plen = key->payload_length; + + if (plen!=NO_PAYLOAD_LENGTH && len!=(plen+MD5_DIGEST_LENGTH)) return 0; + + if (ctx->encrypt) { + if (plen==NO_PAYLOAD_LENGTH) plen = len; +#if defined(STITCHED_CALL) + /* cipher has to "fall behind" */ + if (rc4_off>md5_off) md5_off+=MD5_CBLOCK; + + if (plen>md5_off && (blocks=(plen-md5_off)/MD5_CBLOCK) && + (OPENSSL_ia32cap_P[0]&(1<<20))==0) { + MD5_Update(&key->md,in,md5_off); + RC4(&key->ks,rc4_off,in,out); + + rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off, + &key->md,in+md5_off,blocks); + blocks *= MD5_CBLOCK; + rc4_off += blocks; + md5_off += blocks; + key->md.Nh += blocks>>29; + key->md.Nl += blocks<<=3; + if (key->md.Nl<(unsigned int)blocks) key->md.Nh++; + } else { + rc4_off = 0; + md5_off = 0; + } +#endif + MD5_Update(&key->md,in+md5_off,plen-md5_off); + + if (plen!=len) { /* "TLS" mode of operation */ + if (in!=out) + memcpy(out+rc4_off,in+rc4_off,plen-rc4_off); + + /* calculate HMAC and append it to payload */ + MD5_Final(out+plen,&key->md); + key->md = key->tail; + MD5_Update(&key->md,out+plen,MD5_DIGEST_LENGTH); + MD5_Final(out+plen,&key->md); + /* encrypt HMAC at once */ + RC4(&key->ks,len-rc4_off,out+rc4_off,out+rc4_off); + } else { + RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off); + } + } else { + unsigned char mac[MD5_DIGEST_LENGTH]; +#if defined(STITCHED_CALL) + /* digest has to "fall behind" */ + if (md5_off>rc4_off) rc4_off += 2*MD5_CBLOCK; + else rc4_off += MD5_CBLOCK; + + if (len>rc4_off && (blocks=(len-rc4_off)/MD5_CBLOCK) && + (OPENSSL_ia32cap_P[0]&(1<<20))==0) { + RC4(&key->ks,rc4_off,in,out); + MD5_Update(&key->md,out,md5_off); + + rc4_md5_enc(&key->ks,in+rc4_off,out+rc4_off, + &key->md,out+md5_off,blocks); + blocks *= MD5_CBLOCK; + rc4_off += blocks; + md5_off += blocks; + l = (key->md.Nl+(blocks<<3))&0xffffffffU; + if (lmd.Nl) key->md.Nh++; + key->md.Nl = l; + key->md.Nh += blocks>>29; + } else { + md5_off=0; + rc4_off=0; + } +#endif + /* decrypt HMAC at once */ + RC4(&key->ks,len-rc4_off,in+rc4_off,out+rc4_off); + if (plen!=NO_PAYLOAD_LENGTH) { /* "TLS" mode of operation */ + MD5_Update(&key->md,out+md5_off,plen-md5_off); + + /* calculate HMAC and verify it */ + MD5_Final(mac,&key->md); + key->md = key->tail; + MD5_Update(&key->md,mac,MD5_DIGEST_LENGTH); + MD5_Final(mac,&key->md); + + if (memcmp(out+plen,mac,MD5_DIGEST_LENGTH)) + return 0; + } else { + MD5_Update(&key->md,out+md5_off,len-md5_off); + } + } + + key->payload_length = NO_PAYLOAD_LENGTH; + + return 1; + } + +static int rc4_hmac_md5_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr) + { + EVP_RC4_HMAC_MD5 *key = data(ctx); + + switch (type) + { + case EVP_CTRL_AEAD_SET_MAC_KEY: + { + unsigned int i; + unsigned char hmac_key[64]; + + memset (hmac_key,0,sizeof(hmac_key)); + + if (arg > (int)sizeof(hmac_key)) { + MD5_Init(&key->head); + MD5_Update(&key->head,ptr,arg); + MD5_Final(hmac_key,&key->head); + } else { + memcpy(hmac_key,ptr,arg); + } + + for (i=0;ihead); + MD5_Update(&key->head,hmac_key,sizeof(hmac_key)); + + for (i=0;itail); + MD5_Update(&key->tail,hmac_key,sizeof(hmac_key)); + + return 1; + } + case EVP_CTRL_AEAD_TLS1_AAD: + { + unsigned char *p=ptr; + unsigned int len=p[arg-2]<<8|p[arg-1]; + + if (!ctx->encrypt) + { + len -= MD5_DIGEST_LENGTH; + p[arg-2] = len>>8; + p[arg-1] = len; + } + key->payload_length=len; + key->md = key->head; + MD5_Update(&key->md,p,arg); + + return MD5_DIGEST_LENGTH; + } + default: + return -1; + } + } + +static EVP_CIPHER r4_hmac_md5_cipher= + { +#ifdef NID_rc4_hmac_md5 + NID_rc4_hmac_md5, +#else + NID_undef, +#endif + 1,EVP_RC4_KEY_SIZE,0, + EVP_CIPH_STREAM_CIPHER|EVP_CIPH_VARIABLE_LENGTH|EVP_CIPH_FLAG_AEAD_CIPHER, + rc4_hmac_md5_init_key, + rc4_hmac_md5_cipher, + NULL, + sizeof(EVP_RC4_HMAC_MD5), + NULL, + NULL, + rc4_hmac_md5_ctrl, + NULL + }; + +const EVP_CIPHER *EVP_rc4_hmac_md5(void) + { + return(&r4_hmac_md5_cipher); + } +#endif diff --git a/src/lib/libcrypto/evp/evp.h b/src/lib/libcrypto/evp/evp.h index 9f9795e2d9..0d1b20a7d3 100644 --- a/src/lib/libcrypto/evp/evp.h +++ b/src/lib/libcrypto/evp/evp.h @@ -83,7 +83,7 @@ #define EVP_RC5_32_12_16_KEY_SIZE 16 */ #define EVP_MAX_MD_SIZE 64 /* longest known is SHA512 */ -#define EVP_MAX_KEY_LENGTH 32 +#define EVP_MAX_KEY_LENGTH 64 #define EVP_MAX_IV_LENGTH 16 #define EVP_MAX_BLOCK_LENGTH 32 @@ -116,6 +116,7 @@ #define EVP_PKEY_DH NID_dhKeyAgreement #define EVP_PKEY_EC NID_X9_62_id_ecPublicKey #define EVP_PKEY_HMAC NID_hmac +#define EVP_PKEY_CMAC NID_cmac #ifdef __cplusplus extern "C" { @@ -216,6 +217,8 @@ typedef int evp_verify_method(int type,const unsigned char *m, #define EVP_MD_FLAG_DIGALGID_CUSTOM 0x0018 +#define EVP_MD_FLAG_FIPS 0x0400 /* Note if suitable for use in FIPS mode */ + /* Digest ctrls */ #define EVP_MD_CTRL_DIGALGID 0x1 @@ -325,6 +328,10 @@ struct evp_cipher_st #define EVP_CIPH_CBC_MODE 0x2 #define EVP_CIPH_CFB_MODE 0x3 #define EVP_CIPH_OFB_MODE 0x4 +#define EVP_CIPH_CTR_MODE 0x5 +#define EVP_CIPH_GCM_MODE 0x6 +#define EVP_CIPH_CCM_MODE 0x7 +#define EVP_CIPH_XTS_MODE 0x10001 #define EVP_CIPH_MODE 0xF0007 /* Set if variable length cipher */ #define EVP_CIPH_VARIABLE_LENGTH 0x8 @@ -346,6 +353,15 @@ struct evp_cipher_st #define EVP_CIPH_FLAG_DEFAULT_ASN1 0x1000 /* Buffer length in bits not bytes: CFB1 mode only */ #define EVP_CIPH_FLAG_LENGTH_BITS 0x2000 +/* Note if suitable for use in FIPS mode */ +#define EVP_CIPH_FLAG_FIPS 0x4000 +/* Allow non FIPS cipher in FIPS mode */ +#define EVP_CIPH_FLAG_NON_FIPS_ALLOW 0x8000 +/* Cipher handles any and all padding logic as well + * as finalisation. + */ +#define EVP_CIPH_FLAG_CUSTOM_CIPHER 0x100000 +#define EVP_CIPH_FLAG_AEAD_CIPHER 0x200000 /* ctrl() values */ @@ -358,6 +374,34 @@ struct evp_cipher_st #define EVP_CTRL_RAND_KEY 0x6 #define EVP_CTRL_PBE_PRF_NID 0x7 #define EVP_CTRL_COPY 0x8 +#define EVP_CTRL_GCM_SET_IVLEN 0x9 +#define EVP_CTRL_GCM_GET_TAG 0x10 +#define EVP_CTRL_GCM_SET_TAG 0x11 +#define EVP_CTRL_GCM_SET_IV_FIXED 0x12 +#define EVP_CTRL_GCM_IV_GEN 0x13 +#define EVP_CTRL_CCM_SET_IVLEN EVP_CTRL_GCM_SET_IVLEN +#define EVP_CTRL_CCM_GET_TAG EVP_CTRL_GCM_GET_TAG +#define EVP_CTRL_CCM_SET_TAG EVP_CTRL_GCM_SET_TAG +#define EVP_CTRL_CCM_SET_L 0x14 +#define EVP_CTRL_CCM_SET_MSGLEN 0x15 +/* AEAD cipher deduces payload length and returns number of bytes + * required to store MAC and eventual padding. Subsequent call to + * EVP_Cipher even appends/verifies MAC. + */ +#define EVP_CTRL_AEAD_TLS1_AAD 0x16 +/* Used by composite AEAD ciphers, no-op in GCM, CCM... */ +#define EVP_CTRL_AEAD_SET_MAC_KEY 0x17 +/* Set the GCM invocation field, decrypt only */ +#define EVP_CTRL_GCM_SET_IV_INV 0x18 + +/* GCM TLS constants */ +/* Length of fixed part of IV derived from PRF */ +#define EVP_GCM_TLS_FIXED_IV_LEN 4 +/* Length of explicit part of IV part of TLS records */ +#define EVP_GCM_TLS_EXPLICIT_IV_LEN 8 +/* Length of tag for TLS */ +#define EVP_GCM_TLS_TAG_LEN 16 + typedef struct evp_cipher_info_st { @@ -375,7 +419,7 @@ struct evp_cipher_ctx_st unsigned char oiv[EVP_MAX_IV_LENGTH]; /* original iv */ unsigned char iv[EVP_MAX_IV_LENGTH]; /* working iv */ unsigned char buf[EVP_MAX_BLOCK_LENGTH];/* saved partial block */ - int num; /* used by cfb/ofb mode */ + int num; /* used by cfb/ofb/ctr mode */ void *app_data; /* application stuff */ int key_len; /* May change for variable length cipher */ @@ -695,6 +739,9 @@ const EVP_MD *EVP_dev_crypto_md5(void); #ifndef OPENSSL_NO_RC4 const EVP_CIPHER *EVP_rc4(void); const EVP_CIPHER *EVP_rc4_40(void); +#ifndef OPENSSL_NO_MD5 +const EVP_CIPHER *EVP_rc4_hmac_md5(void); +#endif #endif #ifndef OPENSSL_NO_IDEA const EVP_CIPHER *EVP_idea_ecb(void); @@ -741,9 +788,10 @@ const EVP_CIPHER *EVP_aes_128_cfb8(void); const EVP_CIPHER *EVP_aes_128_cfb128(void); # define EVP_aes_128_cfb EVP_aes_128_cfb128 const EVP_CIPHER *EVP_aes_128_ofb(void); -#if 0 const EVP_CIPHER *EVP_aes_128_ctr(void); -#endif +const EVP_CIPHER *EVP_aes_128_gcm(void); +const EVP_CIPHER *EVP_aes_128_ccm(void); +const EVP_CIPHER *EVP_aes_128_xts(void); const EVP_CIPHER *EVP_aes_192_ecb(void); const EVP_CIPHER *EVP_aes_192_cbc(void); const EVP_CIPHER *EVP_aes_192_cfb1(void); @@ -751,9 +799,9 @@ const EVP_CIPHER *EVP_aes_192_cfb8(void); const EVP_CIPHER *EVP_aes_192_cfb128(void); # define EVP_aes_192_cfb EVP_aes_192_cfb128 const EVP_CIPHER *EVP_aes_192_ofb(void); -#if 0 const EVP_CIPHER *EVP_aes_192_ctr(void); -#endif +const EVP_CIPHER *EVP_aes_192_gcm(void); +const EVP_CIPHER *EVP_aes_192_ccm(void); const EVP_CIPHER *EVP_aes_256_ecb(void); const EVP_CIPHER *EVP_aes_256_cbc(void); const EVP_CIPHER *EVP_aes_256_cfb1(void); @@ -761,8 +809,13 @@ const EVP_CIPHER *EVP_aes_256_cfb8(void); const EVP_CIPHER *EVP_aes_256_cfb128(void); # define EVP_aes_256_cfb EVP_aes_256_cfb128 const EVP_CIPHER *EVP_aes_256_ofb(void); -#if 0 const EVP_CIPHER *EVP_aes_256_ctr(void); +const EVP_CIPHER *EVP_aes_256_gcm(void); +const EVP_CIPHER *EVP_aes_256_ccm(void); +const EVP_CIPHER *EVP_aes_256_xts(void); +#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1) +const EVP_CIPHER *EVP_aes_128_cbc_hmac_sha1(void); +const EVP_CIPHER *EVP_aes_256_cbc_hmac_sha1(void); #endif #endif #ifndef OPENSSL_NO_CAMELLIA @@ -1047,13 +1100,22 @@ void EVP_PKEY_asn1_set_ctrl(EVP_PKEY_ASN1_METHOD *ameth, #define EVP_PKEY_CTRL_CMS_DECRYPT 10 #define EVP_PKEY_CTRL_CMS_SIGN 11 +#define EVP_PKEY_CTRL_CIPHER 12 + #define EVP_PKEY_ALG_CTRL 0x1000 #define EVP_PKEY_FLAG_AUTOARGLEN 2 +/* Method handles all operations: don't assume any digest related + * defaults. + */ +#define EVP_PKEY_FLAG_SIGCTX_CUSTOM 4 const EVP_PKEY_METHOD *EVP_PKEY_meth_find(int type); EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags); +void EVP_PKEY_meth_get0_info(int *ppkey_id, int *pflags, + const EVP_PKEY_METHOD *meth); +void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src); void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth); int EVP_PKEY_meth_add0(const EVP_PKEY_METHOD *pmeth); @@ -1071,7 +1133,7 @@ int EVP_PKEY_CTX_get_operation(EVP_PKEY_CTX *ctx); void EVP_PKEY_CTX_set0_keygen_info(EVP_PKEY_CTX *ctx, int *dat, int datlen); EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e, - unsigned char *key, int keylen); + const unsigned char *key, int keylen); void EVP_PKEY_CTX_set_data(EVP_PKEY_CTX *ctx, void *data); void *EVP_PKEY_CTX_get_data(EVP_PKEY_CTX *ctx); @@ -1190,8 +1252,13 @@ void ERR_load_EVP_strings(void); /* Error codes for the EVP functions. */ /* Function codes. */ +#define EVP_F_AESNI_INIT_KEY 165 +#define EVP_F_AESNI_XTS_CIPHER 176 #define EVP_F_AES_INIT_KEY 133 +#define EVP_F_AES_XTS 172 +#define EVP_F_AES_XTS_CIPHER 175 #define EVP_F_CAMELLIA_INIT_KEY 159 +#define EVP_F_CMAC_INIT 173 #define EVP_F_D2I_PKEY 100 #define EVP_F_DO_SIGVER_INIT 161 #define EVP_F_DSAPKEY2PKCS8 134 @@ -1246,15 +1313,24 @@ void ERR_load_EVP_strings(void); #define EVP_F_EVP_RIJNDAEL 126 #define EVP_F_EVP_SIGNFINAL 107 #define EVP_F_EVP_VERIFYFINAL 108 +#define EVP_F_FIPS_CIPHERINIT 166 +#define EVP_F_FIPS_CIPHER_CTX_COPY 170 +#define EVP_F_FIPS_CIPHER_CTX_CTRL 167 +#define EVP_F_FIPS_CIPHER_CTX_SET_KEY_LENGTH 171 +#define EVP_F_FIPS_DIGESTINIT 168 +#define EVP_F_FIPS_MD_CTX_COPY 169 +#define EVP_F_HMAC_INIT_EX 174 #define EVP_F_INT_CTX_NEW 157 #define EVP_F_PKCS5_PBE_KEYIVGEN 117 #define EVP_F_PKCS5_V2_PBE_KEYIVGEN 118 +#define EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN 164 #define EVP_F_PKCS8_SET_BROKEN 112 #define EVP_F_PKEY_SET_TYPE 158 #define EVP_F_RC2_MAGIC_TO_METH 109 #define EVP_F_RC5_CTRL 125 /* Reason codes. */ +#define EVP_R_AES_IV_SETUP_FAILED 162 #define EVP_R_AES_KEY_SETUP_FAILED 143 #define EVP_R_ASN1_LIB 140 #define EVP_R_BAD_BLOCK_LENGTH 136 @@ -1272,6 +1348,7 @@ void ERR_load_EVP_strings(void); #define EVP_R_DECODE_ERROR 114 #define EVP_R_DIFFERENT_KEY_TYPES 101 #define EVP_R_DIFFERENT_PARAMETERS 153 +#define EVP_R_DISABLED_FOR_FIPS 163 #define EVP_R_ENCODE_ERROR 115 #define EVP_R_EVP_PBE_CIPHERINIT_ERROR 119 #define EVP_R_EXPECTING_AN_RSA_KEY 127 @@ -1303,6 +1380,7 @@ void ERR_load_EVP_strings(void); #define EVP_R_PRIVATE_KEY_DECODE_ERROR 145 #define EVP_R_PRIVATE_KEY_ENCODE_ERROR 146 #define EVP_R_PUBLIC_KEY_NOT_RSA 106 +#define EVP_R_TOO_LARGE 164 #define EVP_R_UNKNOWN_CIPHER 160 #define EVP_R_UNKNOWN_DIGEST 161 #define EVP_R_UNKNOWN_PBE_ALGORITHM 121 diff --git a/src/lib/libcrypto/evp/evp_enc.c b/src/lib/libcrypto/evp/evp_enc.c index c268d25cb4..0c54f05e6e 100644 --- a/src/lib/libcrypto/evp/evp_enc.c +++ b/src/lib/libcrypto/evp/evp_enc.c @@ -64,8 +64,18 @@ #ifndef OPENSSL_NO_ENGINE #include #endif +#ifdef OPENSSL_FIPS +#include +#endif #include "evp_locl.h" +#ifdef OPENSSL_FIPS +#define M_do_cipher(ctx, out, in, inl) FIPS_cipher(ctx, out, in, inl) +#else +#define M_do_cipher(ctx, out, in, inl) ctx->cipher->do_cipher(ctx, out, in, inl) +#endif + + const char EVP_version[]="EVP" OPENSSL_VERSION_PTEXT; void EVP_CIPHER_CTX_init(EVP_CIPHER_CTX *ctx) @@ -115,10 +125,14 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *imp /* Ensure a context left lying around from last time is cleared * (the previous check attempted to avoid this if the same * ENGINE and EVP_CIPHER could be used). */ - EVP_CIPHER_CTX_cleanup(ctx); - - /* Restore encrypt field: it is zeroed by cleanup */ - ctx->encrypt = enc; + if (ctx->cipher) + { + unsigned long flags = ctx->flags; + EVP_CIPHER_CTX_cleanup(ctx); + /* Restore encrypt and flags */ + ctx->encrypt = enc; + ctx->flags = flags; + } #ifndef OPENSSL_NO_ENGINE if(impl) { @@ -155,6 +169,10 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *imp ctx->engine = NULL; #endif +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + return FIPS_cipherinit(ctx, cipher, key, iv, enc); +#endif ctx->cipher=cipher; if (ctx->cipher->ctx_size) { @@ -187,6 +205,10 @@ int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, ENGINE *imp } #ifndef OPENSSL_NO_ENGINE skip_to_init: +#endif +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + return FIPS_cipherinit(ctx, cipher, key, iv, enc); #endif /* we assume block size is a power of 2 in *cryptUpdate */ OPENSSL_assert(ctx->cipher->block_size == 1 @@ -214,6 +236,13 @@ skip_to_init: memcpy(ctx->iv, ctx->oiv, EVP_CIPHER_CTX_iv_length(ctx)); break; + case EVP_CIPH_CTR_MODE: + ctx->num = 0; + /* Don't reuse IV for CTR mode */ + if(iv) + memcpy(ctx->iv, iv, EVP_CIPHER_CTX_iv_length(ctx)); + break; + default: return 0; break; @@ -280,6 +309,16 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl, { int i,j,bl; + if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) + { + i = M_do_cipher(ctx, out, in, inl); + if (i < 0) + return 0; + else + *outl = i; + return 1; + } + if (inl <= 0) { *outl = 0; @@ -288,7 +327,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl, if(ctx->buf_len == 0 && (inl&(ctx->block_mask)) == 0) { - if(ctx->cipher->do_cipher(ctx,out,in,inl)) + if(M_do_cipher(ctx,out,in,inl)) { *outl=inl; return 1; @@ -315,7 +354,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl, { j=bl-i; memcpy(&(ctx->buf[i]),in,j); - if(!ctx->cipher->do_cipher(ctx,out,ctx->buf,bl)) return 0; + if(!M_do_cipher(ctx,out,ctx->buf,bl)) return 0; inl-=j; in+=j; out+=bl; @@ -328,7 +367,7 @@ int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl, inl-=i; if (inl > 0) { - if(!ctx->cipher->do_cipher(ctx,out,in,inl)) return 0; + if(!M_do_cipher(ctx,out,in,inl)) return 0; *outl+=inl; } @@ -350,6 +389,16 @@ int EVP_EncryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl) int n,ret; unsigned int i, b, bl; + if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) + { + ret = M_do_cipher(ctx, out, NULL, 0); + if (ret < 0) + return 0; + else + *outl = ret; + return 1; + } + b=ctx->cipher->block_size; OPENSSL_assert(b <= sizeof ctx->buf); if (b == 1) @@ -372,7 +421,7 @@ int EVP_EncryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl) n=b-bl; for (i=bl; ibuf[i]=n; - ret=ctx->cipher->do_cipher(ctx,out,ctx->buf,b); + ret=M_do_cipher(ctx,out,ctx->buf,b); if(ret) @@ -387,6 +436,19 @@ int EVP_DecryptUpdate(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl, int fix_len; unsigned int b; + if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) + { + fix_len = M_do_cipher(ctx, out, in, inl); + if (fix_len < 0) + { + *outl = 0; + return 0; + } + else + *outl = fix_len; + return 1; + } + if (inl <= 0) { *outl = 0; @@ -440,8 +502,18 @@ int EVP_DecryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl) { int i,n; unsigned int b; - *outl=0; + + if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) + { + i = M_do_cipher(ctx, out, NULL, 0); + if (i < 0) + return 0; + else + *outl = i; + return 1; + } + b=ctx->cipher->block_size; if (ctx->flags & EVP_CIPH_NO_PADDING) { @@ -496,6 +568,7 @@ void EVP_CIPHER_CTX_free(EVP_CIPHER_CTX *ctx) int EVP_CIPHER_CTX_cleanup(EVP_CIPHER_CTX *c) { +#ifndef OPENSSL_FIPS if (c->cipher != NULL) { if(c->cipher->cleanup && !c->cipher->cleanup(c)) @@ -506,11 +579,15 @@ int EVP_CIPHER_CTX_cleanup(EVP_CIPHER_CTX *c) } if (c->cipher_data) OPENSSL_free(c->cipher_data); +#endif #ifndef OPENSSL_NO_ENGINE if (c->engine) /* The EVP_CIPHER we used belongs to an ENGINE, release the * functional reference we held for this reason. */ ENGINE_finish(c->engine); +#endif +#ifdef OPENSSL_FIPS + FIPS_cipher_ctx_cleanup(c); #endif memset(c,0,sizeof(EVP_CIPHER_CTX)); return 1; diff --git a/src/lib/libcrypto/evp/evp_err.c b/src/lib/libcrypto/evp/evp_err.c index d8bfec0959..db0f76d59b 100644 --- a/src/lib/libcrypto/evp/evp_err.c +++ b/src/lib/libcrypto/evp/evp_err.c @@ -1,6 +1,6 @@ /* crypto/evp/evp_err.c */ /* ==================================================================== - * Copyright (c) 1999-2008 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -70,8 +70,13 @@ static ERR_STRING_DATA EVP_str_functs[]= { +{ERR_FUNC(EVP_F_AESNI_INIT_KEY), "AESNI_INIT_KEY"}, +{ERR_FUNC(EVP_F_AESNI_XTS_CIPHER), "AESNI_XTS_CIPHER"}, {ERR_FUNC(EVP_F_AES_INIT_KEY), "AES_INIT_KEY"}, +{ERR_FUNC(EVP_F_AES_XTS), "AES_XTS"}, +{ERR_FUNC(EVP_F_AES_XTS_CIPHER), "AES_XTS_CIPHER"}, {ERR_FUNC(EVP_F_CAMELLIA_INIT_KEY), "CAMELLIA_INIT_KEY"}, +{ERR_FUNC(EVP_F_CMAC_INIT), "CMAC_INIT"}, {ERR_FUNC(EVP_F_D2I_PKEY), "D2I_PKEY"}, {ERR_FUNC(EVP_F_DO_SIGVER_INIT), "DO_SIGVER_INIT"}, {ERR_FUNC(EVP_F_DSAPKEY2PKCS8), "DSAPKEY2PKCS8"}, @@ -86,7 +91,7 @@ static ERR_STRING_DATA EVP_str_functs[]= {ERR_FUNC(EVP_F_EVP_DIGESTINIT_EX), "EVP_DigestInit_ex"}, {ERR_FUNC(EVP_F_EVP_ENCRYPTFINAL_EX), "EVP_EncryptFinal_ex"}, {ERR_FUNC(EVP_F_EVP_MD_CTX_COPY_EX), "EVP_MD_CTX_copy_ex"}, -{ERR_FUNC(EVP_F_EVP_MD_SIZE), "EVP_MD_SIZE"}, +{ERR_FUNC(EVP_F_EVP_MD_SIZE), "EVP_MD_size"}, {ERR_FUNC(EVP_F_EVP_OPENINIT), "EVP_OpenInit"}, {ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD), "EVP_PBE_alg_add"}, {ERR_FUNC(EVP_F_EVP_PBE_ALG_ADD_TYPE), "EVP_PBE_alg_add_type"}, @@ -126,9 +131,17 @@ static ERR_STRING_DATA EVP_str_functs[]= {ERR_FUNC(EVP_F_EVP_RIJNDAEL), "EVP_RIJNDAEL"}, {ERR_FUNC(EVP_F_EVP_SIGNFINAL), "EVP_SignFinal"}, {ERR_FUNC(EVP_F_EVP_VERIFYFINAL), "EVP_VerifyFinal"}, +{ERR_FUNC(EVP_F_FIPS_CIPHERINIT), "FIPS_CIPHERINIT"}, +{ERR_FUNC(EVP_F_FIPS_CIPHER_CTX_COPY), "FIPS_CIPHER_CTX_COPY"}, +{ERR_FUNC(EVP_F_FIPS_CIPHER_CTX_CTRL), "FIPS_CIPHER_CTX_CTRL"}, +{ERR_FUNC(EVP_F_FIPS_CIPHER_CTX_SET_KEY_LENGTH), "FIPS_CIPHER_CTX_SET_KEY_LENGTH"}, +{ERR_FUNC(EVP_F_FIPS_DIGESTINIT), "FIPS_DIGESTINIT"}, +{ERR_FUNC(EVP_F_FIPS_MD_CTX_COPY), "FIPS_MD_CTX_COPY"}, +{ERR_FUNC(EVP_F_HMAC_INIT_EX), "HMAC_Init_ex"}, {ERR_FUNC(EVP_F_INT_CTX_NEW), "INT_CTX_NEW"}, {ERR_FUNC(EVP_F_PKCS5_PBE_KEYIVGEN), "PKCS5_PBE_keyivgen"}, {ERR_FUNC(EVP_F_PKCS5_V2_PBE_KEYIVGEN), "PKCS5_v2_PBE_keyivgen"}, +{ERR_FUNC(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN), "PKCS5_V2_PBKDF2_KEYIVGEN"}, {ERR_FUNC(EVP_F_PKCS8_SET_BROKEN), "PKCS8_set_broken"}, {ERR_FUNC(EVP_F_PKEY_SET_TYPE), "PKEY_SET_TYPE"}, {ERR_FUNC(EVP_F_RC2_MAGIC_TO_METH), "RC2_MAGIC_TO_METH"}, @@ -138,6 +151,7 @@ static ERR_STRING_DATA EVP_str_functs[]= static ERR_STRING_DATA EVP_str_reasons[]= { +{ERR_REASON(EVP_R_AES_IV_SETUP_FAILED) ,"aes iv setup failed"}, {ERR_REASON(EVP_R_AES_KEY_SETUP_FAILED) ,"aes key setup failed"}, {ERR_REASON(EVP_R_ASN1_LIB) ,"asn1 lib"}, {ERR_REASON(EVP_R_BAD_BLOCK_LENGTH) ,"bad block length"}, @@ -155,6 +169,7 @@ static ERR_STRING_DATA EVP_str_reasons[]= {ERR_REASON(EVP_R_DECODE_ERROR) ,"decode error"}, {ERR_REASON(EVP_R_DIFFERENT_KEY_TYPES) ,"different key types"}, {ERR_REASON(EVP_R_DIFFERENT_PARAMETERS) ,"different parameters"}, +{ERR_REASON(EVP_R_DISABLED_FOR_FIPS) ,"disabled for fips"}, {ERR_REASON(EVP_R_ENCODE_ERROR) ,"encode error"}, {ERR_REASON(EVP_R_EVP_PBE_CIPHERINIT_ERROR),"evp pbe cipherinit error"}, {ERR_REASON(EVP_R_EXPECTING_AN_RSA_KEY) ,"expecting an rsa key"}, @@ -186,6 +201,7 @@ static ERR_STRING_DATA EVP_str_reasons[]= {ERR_REASON(EVP_R_PRIVATE_KEY_DECODE_ERROR),"private key decode error"}, {ERR_REASON(EVP_R_PRIVATE_KEY_ENCODE_ERROR),"private key encode error"}, {ERR_REASON(EVP_R_PUBLIC_KEY_NOT_RSA) ,"public key not rsa"}, +{ERR_REASON(EVP_R_TOO_LARGE) ,"too large"}, {ERR_REASON(EVP_R_UNKNOWN_CIPHER) ,"unknown cipher"}, {ERR_REASON(EVP_R_UNKNOWN_DIGEST) ,"unknown digest"}, {ERR_REASON(EVP_R_UNKNOWN_PBE_ALGORITHM) ,"unknown pbe algorithm"}, diff --git a/src/lib/libcrypto/evp/evp_key.c b/src/lib/libcrypto/evp/evp_key.c index 839d6a3a16..7961fbebf2 100644 --- a/src/lib/libcrypto/evp/evp_key.c +++ b/src/lib/libcrypto/evp/evp_key.c @@ -120,7 +120,7 @@ int EVP_BytesToKey(const EVP_CIPHER *type, const EVP_MD *md, unsigned char md_buf[EVP_MAX_MD_SIZE]; int niv,nkey,addmd=0; unsigned int mds=0,i; - + int rv = 0; nkey=type->key_len; niv=type->iv_len; OPENSSL_assert(nkey <= EVP_MAX_KEY_LENGTH); @@ -134,17 +134,24 @@ int EVP_BytesToKey(const EVP_CIPHER *type, const EVP_MD *md, if (!EVP_DigestInit_ex(&c,md, NULL)) return 0; if (addmd++) - EVP_DigestUpdate(&c,&(md_buf[0]),mds); - EVP_DigestUpdate(&c,data,datal); + if (!EVP_DigestUpdate(&c,&(md_buf[0]),mds)) + goto err; + if (!EVP_DigestUpdate(&c,data,datal)) + goto err; if (salt != NULL) - EVP_DigestUpdate(&c,salt,PKCS5_SALT_LEN); - EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds); + if (!EVP_DigestUpdate(&c,salt,PKCS5_SALT_LEN)) + goto err; + if (!EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds)) + goto err; for (i=1; i<(unsigned int)count; i++) { - EVP_DigestInit_ex(&c,md, NULL); - EVP_DigestUpdate(&c,&(md_buf[0]),mds); - EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds); + if (!EVP_DigestInit_ex(&c,md, NULL)) + goto err; + if (!EVP_DigestUpdate(&c,&(md_buf[0]),mds)) + goto err; + if (!EVP_DigestFinal_ex(&c,&(md_buf[0]),&mds)) + goto err; } i=0; if (nkey) @@ -173,8 +180,10 @@ int EVP_BytesToKey(const EVP_CIPHER *type, const EVP_MD *md, } if ((nkey == 0) && (niv == 0)) break; } + rv = type->key_len; + err: EVP_MD_CTX_cleanup(&c); OPENSSL_cleanse(&(md_buf[0]),EVP_MAX_MD_SIZE); - return(type->key_len); + return rv; } diff --git a/src/lib/libcrypto/evp/evp_lib.c b/src/lib/libcrypto/evp/evp_lib.c index 40951a04f0..b180e4828a 100644 --- a/src/lib/libcrypto/evp/evp_lib.c +++ b/src/lib/libcrypto/evp/evp_lib.c @@ -67,6 +67,8 @@ int EVP_CIPHER_param_to_asn1(EVP_CIPHER_CTX *c, ASN1_TYPE *type) if (c->cipher->set_asn1_parameters != NULL) ret=c->cipher->set_asn1_parameters(c,type); + else if (c->cipher->flags & EVP_CIPH_FLAG_DEFAULT_ASN1) + ret=EVP_CIPHER_set_asn1_iv(c, type); else ret=-1; return(ret); @@ -78,6 +80,8 @@ int EVP_CIPHER_asn1_to_param(EVP_CIPHER_CTX *c, ASN1_TYPE *type) if (c->cipher->get_asn1_parameters != NULL) ret=c->cipher->get_asn1_parameters(c,type); + else if (c->cipher->flags & EVP_CIPH_FLAG_DEFAULT_ASN1) + ret=EVP_CIPHER_get_asn1_iv(c, type); else ret=-1; return(ret); diff --git a/src/lib/libcrypto/evp/evp_locl.h b/src/lib/libcrypto/evp/evp_locl.h index 292d74c188..08c0a66d39 100644 --- a/src/lib/libcrypto/evp/evp_locl.h +++ b/src/lib/libcrypto/evp/evp_locl.h @@ -343,3 +343,43 @@ struct evp_pkey_method_st } /* EVP_PKEY_METHOD */; void evp_pkey_set_cb_translate(BN_GENCB *cb, EVP_PKEY_CTX *ctx); + +int PKCS5_v2_PBKDF2_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen, + ASN1_TYPE *param, + const EVP_CIPHER *c, const EVP_MD *md, int en_de); + +#ifdef OPENSSL_FIPS + +#ifdef OPENSSL_DOING_MAKEDEPEND +#undef SHA1_Init +#undef SHA1_Update +#undef SHA224_Init +#undef SHA256_Init +#undef SHA384_Init +#undef SHA512_Init +#undef DES_set_key_unchecked +#endif + +#define RIPEMD160_Init private_RIPEMD160_Init +#define WHIRLPOOL_Init private_WHIRLPOOL_Init +#define MD5_Init private_MD5_Init +#define MD4_Init private_MD4_Init +#define MD2_Init private_MD2_Init +#define MDC2_Init private_MDC2_Init +#define SHA_Init private_SHA_Init +#define SHA1_Init private_SHA1_Init +#define SHA224_Init private_SHA224_Init +#define SHA256_Init private_SHA256_Init +#define SHA384_Init private_SHA384_Init +#define SHA512_Init private_SHA512_Init + +#define BF_set_key private_BF_set_key +#define CAST_set_key private_CAST_set_key +#define idea_set_encrypt_key private_idea_set_encrypt_key +#define SEED_set_key private_SEED_set_key +#define RC2_set_key private_RC2_set_key +#define RC4_set_key private_RC4_set_key +#define DES_set_key_unchecked private_DES_set_key_unchecked +#define Camellia_set_key private_Camellia_set_key + +#endif diff --git a/src/lib/libcrypto/evp/evp_pbe.c b/src/lib/libcrypto/evp/evp_pbe.c index c9d932d205..f8c32d825e 100644 --- a/src/lib/libcrypto/evp/evp_pbe.c +++ b/src/lib/libcrypto/evp/evp_pbe.c @@ -61,6 +61,7 @@ #include #include #include +#include "evp_locl.h" /* Password based encryption (PBE) functions */ @@ -87,6 +88,10 @@ static const EVP_PBE_CTL builtin_pbe[] = {EVP_PBE_TYPE_OUTER, NID_pbeWithSHA1AndRC2_CBC, NID_rc2_64_cbc, NID_sha1, PKCS5_PBE_keyivgen}, +#ifndef OPENSSL_NO_HMAC + {EVP_PBE_TYPE_OUTER, NID_id_pbkdf2, -1, -1, PKCS5_v2_PBKDF2_keyivgen}, +#endif + {EVP_PBE_TYPE_OUTER, NID_pbe_WithSHA1And128BitRC4, NID_rc4, NID_sha1, PKCS12_PBE_keyivgen}, {EVP_PBE_TYPE_OUTER, NID_pbe_WithSHA1And40BitRC4, diff --git a/src/lib/libcrypto/evp/m_dss.c b/src/lib/libcrypto/evp/m_dss.c index 48c2689504..4ad63ada6f 100644 --- a/src/lib/libcrypto/evp/m_dss.c +++ b/src/lib/libcrypto/evp/m_dss.c @@ -66,6 +66,7 @@ #endif #ifndef OPENSSL_NO_SHA +#ifndef OPENSSL_FIPS static int init(EVP_MD_CTX *ctx) { return SHA1_Init(ctx->md_data); } @@ -97,3 +98,4 @@ const EVP_MD *EVP_dss(void) return(&dsa_md); } #endif +#endif diff --git a/src/lib/libcrypto/evp/m_dss1.c b/src/lib/libcrypto/evp/m_dss1.c index 4f03fb70e0..f80170efeb 100644 --- a/src/lib/libcrypto/evp/m_dss1.c +++ b/src/lib/libcrypto/evp/m_dss1.c @@ -68,6 +68,8 @@ #include #endif +#ifndef OPENSSL_FIPS + static int init(EVP_MD_CTX *ctx) { return SHA1_Init(ctx->md_data); } @@ -98,3 +100,4 @@ const EVP_MD *EVP_dss1(void) return(&dss1_md); } #endif +#endif diff --git a/src/lib/libcrypto/evp/m_ecdsa.c b/src/lib/libcrypto/evp/m_ecdsa.c index 8d87a49ebe..4b15fb0f6c 100644 --- a/src/lib/libcrypto/evp/m_ecdsa.c +++ b/src/lib/libcrypto/evp/m_ecdsa.c @@ -116,6 +116,8 @@ #include #ifndef OPENSSL_NO_SHA +#ifndef OPENSSL_FIPS + static int init(EVP_MD_CTX *ctx) { return SHA1_Init(ctx->md_data); } @@ -146,3 +148,4 @@ const EVP_MD *EVP_ecdsa(void) return(&ecdsa_md); } #endif +#endif diff --git a/src/lib/libcrypto/evp/m_md4.c b/src/lib/libcrypto/evp/m_md4.c index 1e0b7c5b42..6d47f61b27 100644 --- a/src/lib/libcrypto/evp/m_md4.c +++ b/src/lib/libcrypto/evp/m_md4.c @@ -69,6 +69,8 @@ #include #endif +#include "evp_locl.h" + static int init(EVP_MD_CTX *ctx) { return MD4_Init(ctx->md_data); } diff --git a/src/lib/libcrypto/evp/m_md5.c b/src/lib/libcrypto/evp/m_md5.c index 63c142119e..9a8bae0258 100644 --- a/src/lib/libcrypto/evp/m_md5.c +++ b/src/lib/libcrypto/evp/m_md5.c @@ -68,6 +68,7 @@ #ifndef OPENSSL_NO_RSA #include #endif +#include "evp_locl.h" static int init(EVP_MD_CTX *ctx) { return MD5_Init(ctx->md_data); } diff --git a/src/lib/libcrypto/evp/m_ripemd.c b/src/lib/libcrypto/evp/m_ripemd.c index a1d60ee78d..7bf4804cf8 100644 --- a/src/lib/libcrypto/evp/m_ripemd.c +++ b/src/lib/libcrypto/evp/m_ripemd.c @@ -68,6 +68,7 @@ #ifndef OPENSSL_NO_RSA #include #endif +#include "evp_locl.h" static int init(EVP_MD_CTX *ctx) { return RIPEMD160_Init(ctx->md_data); } diff --git a/src/lib/libcrypto/evp/m_sha1.c b/src/lib/libcrypto/evp/m_sha1.c index 9a2790fdea..3cb11f1ebb 100644 --- a/src/lib/libcrypto/evp/m_sha1.c +++ b/src/lib/libcrypto/evp/m_sha1.c @@ -59,6 +59,8 @@ #include #include "cryptlib.h" +#ifndef OPENSSL_FIPS + #ifndef OPENSSL_NO_SHA #include @@ -68,6 +70,7 @@ #include #endif + static int init(EVP_MD_CTX *ctx) { return SHA1_Init(ctx->md_data); } @@ -202,3 +205,5 @@ static const EVP_MD sha512_md= const EVP_MD *EVP_sha512(void) { return(&sha512_md); } #endif /* ifndef OPENSSL_NO_SHA512 */ + +#endif diff --git a/src/lib/libcrypto/evp/m_wp.c b/src/lib/libcrypto/evp/m_wp.c index 1ce47c040b..c51bc2d5d1 100644 --- a/src/lib/libcrypto/evp/m_wp.c +++ b/src/lib/libcrypto/evp/m_wp.c @@ -9,6 +9,7 @@ #include #include #include +#include "evp_locl.h" static int init(EVP_MD_CTX *ctx) { return WHIRLPOOL_Init(ctx->md_data); } diff --git a/src/lib/libcrypto/evp/names.c b/src/lib/libcrypto/evp/names.c index f2869f5c78..6311ad7cfb 100644 --- a/src/lib/libcrypto/evp/names.c +++ b/src/lib/libcrypto/evp/names.c @@ -66,6 +66,10 @@ int EVP_add_cipher(const EVP_CIPHER *c) { int r; + if (c == NULL) return 0; + + OPENSSL_init(); + r=OBJ_NAME_add(OBJ_nid2sn(c->nid),OBJ_NAME_TYPE_CIPHER_METH,(const char *)c); if (r == 0) return(0); check_defer(c->nid); @@ -78,6 +82,7 @@ int EVP_add_digest(const EVP_MD *md) { int r; const char *name; + OPENSSL_init(); name=OBJ_nid2sn(md->type); r=OBJ_NAME_add(name,OBJ_NAME_TYPE_MD_METH,(const char *)md); diff --git a/src/lib/libcrypto/evp/p5_crpt.c b/src/lib/libcrypto/evp/p5_crpt.c index 7ecfa8dad9..294cc90d87 100644 --- a/src/lib/libcrypto/evp/p5_crpt.c +++ b/src/lib/libcrypto/evp/p5_crpt.c @@ -82,6 +82,8 @@ int PKCS5_PBE_keyivgen(EVP_CIPHER_CTX *cctx, const char *pass, int passlen, unsigned char *salt; const unsigned char *pbuf; int mdsize; + int rv = 0; + EVP_MD_CTX_init(&ctx); /* Extract useful info from parameter */ if (param == NULL || param->type != V_ASN1_SEQUENCE || @@ -104,29 +106,38 @@ int PKCS5_PBE_keyivgen(EVP_CIPHER_CTX *cctx, const char *pass, int passlen, if(!pass) passlen = 0; else if(passlen == -1) passlen = strlen(pass); - EVP_MD_CTX_init(&ctx); - EVP_DigestInit_ex(&ctx, md, NULL); - EVP_DigestUpdate(&ctx, pass, passlen); - EVP_DigestUpdate(&ctx, salt, saltlen); + if (!EVP_DigestInit_ex(&ctx, md, NULL)) + goto err; + if (!EVP_DigestUpdate(&ctx, pass, passlen)) + goto err; + if (!EVP_DigestUpdate(&ctx, salt, saltlen)) + goto err; PBEPARAM_free(pbe); - EVP_DigestFinal_ex(&ctx, md_tmp, NULL); + if (!EVP_DigestFinal_ex(&ctx, md_tmp, NULL)) + goto err; mdsize = EVP_MD_size(md); if (mdsize < 0) return 0; for (i = 1; i < iter; i++) { - EVP_DigestInit_ex(&ctx, md, NULL); - EVP_DigestUpdate(&ctx, md_tmp, mdsize); - EVP_DigestFinal_ex (&ctx, md_tmp, NULL); + if (!EVP_DigestInit_ex(&ctx, md, NULL)) + goto err; + if (!EVP_DigestUpdate(&ctx, md_tmp, mdsize)) + goto err; + if (!EVP_DigestFinal_ex (&ctx, md_tmp, NULL)) + goto err; } - EVP_MD_CTX_cleanup(&ctx); OPENSSL_assert(EVP_CIPHER_key_length(cipher) <= (int)sizeof(md_tmp)); memcpy(key, md_tmp, EVP_CIPHER_key_length(cipher)); OPENSSL_assert(EVP_CIPHER_iv_length(cipher) <= 16); memcpy(iv, md_tmp + (16 - EVP_CIPHER_iv_length(cipher)), EVP_CIPHER_iv_length(cipher)); - EVP_CipherInit_ex(cctx, cipher, NULL, key, iv, en_de); + if (!EVP_CipherInit_ex(cctx, cipher, NULL, key, iv, en_de)) + goto err; OPENSSL_cleanse(md_tmp, EVP_MAX_MD_SIZE); OPENSSL_cleanse(key, EVP_MAX_KEY_LENGTH); OPENSSL_cleanse(iv, EVP_MAX_IV_LENGTH); - return 1; + rv = 1; + err: + EVP_MD_CTX_cleanup(&ctx); + return rv; } diff --git a/src/lib/libcrypto/evp/p5_crpt2.c b/src/lib/libcrypto/evp/p5_crpt2.c index 334379f310..975d004df4 100644 --- a/src/lib/libcrypto/evp/p5_crpt2.c +++ b/src/lib/libcrypto/evp/p5_crpt2.c @@ -62,6 +62,7 @@ #include #include #include +#include "evp_locl.h" /* set this to print out info about the keygen algorithm */ /* #define DEBUG_PKCS5V2 */ @@ -110,10 +111,14 @@ int PKCS5_PBKDF2_HMAC(const char *pass, int passlen, itmp[1] = (unsigned char)((i >> 16) & 0xff); itmp[2] = (unsigned char)((i >> 8) & 0xff); itmp[3] = (unsigned char)(i & 0xff); - HMAC_Init_ex(&hctx, pass, passlen, digest, NULL); - HMAC_Update(&hctx, salt, saltlen); - HMAC_Update(&hctx, itmp, 4); - HMAC_Final(&hctx, digtmp, NULL); + if (!HMAC_Init_ex(&hctx, pass, passlen, digest, NULL) + || !HMAC_Update(&hctx, salt, saltlen) + || !HMAC_Update(&hctx, itmp, 4) + || !HMAC_Final(&hctx, digtmp, NULL)) + { + HMAC_CTX_cleanup(&hctx); + return 0; + } memcpy(p, digtmp, cplen); for(j = 1; j < iter; j++) { @@ -168,27 +173,24 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen, ASN1_TYPE *param, const EVP_CIPHER *c, const EVP_MD *md, int en_de) { - unsigned char *salt, key[EVP_MAX_KEY_LENGTH]; const unsigned char *pbuf; - int saltlen, iter, plen; - unsigned int keylen; + int plen; PBE2PARAM *pbe2 = NULL; const EVP_CIPHER *cipher; - PBKDF2PARAM *kdf = NULL; - const EVP_MD *prfmd; - int prf_nid, hmac_md_nid; + + int rv = 0; if (param == NULL || param->type != V_ASN1_SEQUENCE || param->value.sequence == NULL) { EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR); - return 0; + goto err; } pbuf = param->value.sequence->data; plen = param->value.sequence->length; if(!(pbe2 = d2i_PBE2PARAM(NULL, &pbuf, plen))) { EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR); - return 0; + goto err; } /* See if we recognise the key derivation function */ @@ -211,38 +213,63 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen, } /* Fixup cipher based on AlgorithmIdentifier */ - EVP_CipherInit_ex(ctx, cipher, NULL, NULL, NULL, en_de); + if (!EVP_CipherInit_ex(ctx, cipher, NULL, NULL, NULL, en_de)) + goto err; if(EVP_CIPHER_asn1_to_param(ctx, pbe2->encryption->parameter) < 0) { EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN, EVP_R_CIPHER_PARAMETER_ERROR); goto err; } + rv = PKCS5_v2_PBKDF2_keyivgen(ctx, pass, passlen, + pbe2->keyfunc->parameter, c, md, en_de); + err: + PBE2PARAM_free(pbe2); + return rv; +} + +int PKCS5_v2_PBKDF2_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen, + ASN1_TYPE *param, + const EVP_CIPHER *c, const EVP_MD *md, int en_de) +{ + unsigned char *salt, key[EVP_MAX_KEY_LENGTH]; + const unsigned char *pbuf; + int saltlen, iter, plen; + int rv = 0; + unsigned int keylen = 0; + int prf_nid, hmac_md_nid; + PBKDF2PARAM *kdf = NULL; + const EVP_MD *prfmd; + + if (EVP_CIPHER_CTX_cipher(ctx) == NULL) + { + EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,EVP_R_NO_CIPHER_SET); + goto err; + } keylen = EVP_CIPHER_CTX_key_length(ctx); OPENSSL_assert(keylen <= sizeof key); - /* Now decode key derivation function */ + /* Decode parameter */ - if(!pbe2->keyfunc->parameter || - (pbe2->keyfunc->parameter->type != V_ASN1_SEQUENCE)) + if(!param || (param->type != V_ASN1_SEQUENCE)) { - EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR); + EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,EVP_R_DECODE_ERROR); goto err; } - pbuf = pbe2->keyfunc->parameter->value.sequence->data; - plen = pbe2->keyfunc->parameter->value.sequence->length; + pbuf = param->value.sequence->data; + plen = param->value.sequence->length; + if(!(kdf = d2i_PBKDF2PARAM(NULL, &pbuf, plen)) ) { - EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN,EVP_R_DECODE_ERROR); + EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN,EVP_R_DECODE_ERROR); goto err; } - PBE2PARAM_free(pbe2); - pbe2 = NULL; + keylen = EVP_CIPHER_CTX_key_length(ctx); /* Now check the parameters of the kdf */ if(kdf->keylength && (ASN1_INTEGER_get(kdf->keylength) != (int)keylen)){ - EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN, + EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN, EVP_R_UNSUPPORTED_KEYLENGTH); goto err; } @@ -254,19 +281,19 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen, if (!EVP_PBE_find(EVP_PBE_TYPE_PRF, prf_nid, NULL, &hmac_md_nid, 0)) { - EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN, EVP_R_UNSUPPORTED_PRF); + EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN, EVP_R_UNSUPPORTED_PRF); goto err; } prfmd = EVP_get_digestbynid(hmac_md_nid); if (prfmd == NULL) { - EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN, EVP_R_UNSUPPORTED_PRF); + EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN, EVP_R_UNSUPPORTED_PRF); goto err; } if(kdf->salt->type != V_ASN1_OCTET_STRING) { - EVPerr(EVP_F_PKCS5_V2_PBE_KEYIVGEN, + EVPerr(EVP_F_PKCS5_V2_PBKDF2_KEYIVGEN, EVP_R_UNSUPPORTED_SALT_TYPE); goto err; } @@ -278,15 +305,11 @@ int PKCS5_v2_PBE_keyivgen(EVP_CIPHER_CTX *ctx, const char *pass, int passlen, if(!PKCS5_PBKDF2_HMAC(pass, passlen, salt, saltlen, iter, prfmd, keylen, key)) goto err; - EVP_CipherInit_ex(ctx, NULL, NULL, key, NULL, en_de); - OPENSSL_cleanse(key, keylen); - PBKDF2PARAM_free(kdf); - return 1; - + rv = EVP_CipherInit_ex(ctx, NULL, NULL, key, NULL, en_de); err: - PBE2PARAM_free(pbe2); + OPENSSL_cleanse(key, keylen); PBKDF2PARAM_free(kdf); - return 0; + return rv; } #ifdef DEBUG_PKCS5V2 diff --git a/src/lib/libcrypto/evp/p_open.c b/src/lib/libcrypto/evp/p_open.c index 53a59a295c..c748fbea87 100644 --- a/src/lib/libcrypto/evp/p_open.c +++ b/src/lib/libcrypto/evp/p_open.c @@ -115,7 +115,8 @@ int EVP_OpenFinal(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl) int i; i=EVP_DecryptFinal_ex(ctx,out,outl); - EVP_DecryptInit_ex(ctx,NULL,NULL,NULL,NULL); + if (i) + i = EVP_DecryptInit_ex(ctx,NULL,NULL,NULL,NULL); return(i); } #else /* !OPENSSL_NO_RSA */ diff --git a/src/lib/libcrypto/evp/p_seal.c b/src/lib/libcrypto/evp/p_seal.c index d8324526e7..e5919b0fbf 100644 --- a/src/lib/libcrypto/evp/p_seal.c +++ b/src/lib/libcrypto/evp/p_seal.c @@ -110,6 +110,7 @@ int EVP_SealFinal(EVP_CIPHER_CTX *ctx, unsigned char *out, int *outl) { int i; i = EVP_EncryptFinal_ex(ctx,out,outl); - EVP_EncryptInit_ex(ctx,NULL,NULL,NULL,NULL); + if (i) + i = EVP_EncryptInit_ex(ctx,NULL,NULL,NULL,NULL); return i; } diff --git a/src/lib/libcrypto/evp/p_sign.c b/src/lib/libcrypto/evp/p_sign.c index bb893f5bde..dfa48c157c 100644 --- a/src/lib/libcrypto/evp/p_sign.c +++ b/src/lib/libcrypto/evp/p_sign.c @@ -80,18 +80,20 @@ int EVP_SignFinal(EVP_MD_CTX *ctx, unsigned char *sigret, unsigned int *siglen, { unsigned char m[EVP_MAX_MD_SIZE]; unsigned int m_len; - int i,ok=0,v; + int i=0,ok=0,v; EVP_MD_CTX tmp_ctx; + EVP_PKEY_CTX *pkctx = NULL; *siglen=0; EVP_MD_CTX_init(&tmp_ctx); - EVP_MD_CTX_copy_ex(&tmp_ctx,ctx); - EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len); + if (!EVP_MD_CTX_copy_ex(&tmp_ctx,ctx)) + goto err; + if (!EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len)) + goto err; EVP_MD_CTX_cleanup(&tmp_ctx); if (ctx->digest->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE) { - EVP_PKEY_CTX *pkctx = NULL; size_t sltmp = (size_t)EVP_PKEY_size(pkey); i = 0; pkctx = EVP_PKEY_CTX_new(pkey, NULL); diff --git a/src/lib/libcrypto/evp/p_verify.c b/src/lib/libcrypto/evp/p_verify.c index 41d4b67130..5f5c409f45 100644 --- a/src/lib/libcrypto/evp/p_verify.c +++ b/src/lib/libcrypto/evp/p_verify.c @@ -67,17 +67,19 @@ int EVP_VerifyFinal(EVP_MD_CTX *ctx, const unsigned char *sigbuf, { unsigned char m[EVP_MAX_MD_SIZE]; unsigned int m_len; - int i,ok=0,v; + int i=-1,ok=0,v; EVP_MD_CTX tmp_ctx; + EVP_PKEY_CTX *pkctx = NULL; EVP_MD_CTX_init(&tmp_ctx); - EVP_MD_CTX_copy_ex(&tmp_ctx,ctx); - EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len); + if (!EVP_MD_CTX_copy_ex(&tmp_ctx,ctx)) + goto err; + if (!EVP_DigestFinal_ex(&tmp_ctx,&(m[0]),&m_len)) + goto err; EVP_MD_CTX_cleanup(&tmp_ctx); if (ctx->digest->flags & EVP_MD_FLAG_PKEY_METHOD_SIGNATURE) { - EVP_PKEY_CTX *pkctx = NULL; i = -1; pkctx = EVP_PKEY_CTX_new(pkey, NULL); if (!pkctx) diff --git a/src/lib/libcrypto/evp/pmeth_gn.c b/src/lib/libcrypto/evp/pmeth_gn.c index 5d74161a09..4651c81370 100644 --- a/src/lib/libcrypto/evp/pmeth_gn.c +++ b/src/lib/libcrypto/evp/pmeth_gn.c @@ -199,7 +199,7 @@ int EVP_PKEY_CTX_get_keygen_info(EVP_PKEY_CTX *ctx, int idx) } EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e, - unsigned char *key, int keylen) + const unsigned char *key, int keylen) { EVP_PKEY_CTX *mac_ctx = NULL; EVP_PKEY *mac_key = NULL; @@ -209,7 +209,8 @@ EVP_PKEY *EVP_PKEY_new_mac_key(int type, ENGINE *e, if (EVP_PKEY_keygen_init(mac_ctx) <= 0) goto merr; if (EVP_PKEY_CTX_ctrl(mac_ctx, -1, EVP_PKEY_OP_KEYGEN, - EVP_PKEY_CTRL_SET_MAC_KEY, keylen, key) <= 0) + EVP_PKEY_CTRL_SET_MAC_KEY, + keylen, (void *)key) <= 0) goto merr; if (EVP_PKEY_keygen(mac_ctx, &mac_key) <= 0) goto merr; diff --git a/src/lib/libcrypto/evp/pmeth_lib.c b/src/lib/libcrypto/evp/pmeth_lib.c index 5481d4b8a5..acfa7b6f87 100644 --- a/src/lib/libcrypto/evp/pmeth_lib.c +++ b/src/lib/libcrypto/evp/pmeth_lib.c @@ -73,7 +73,7 @@ DECLARE_STACK_OF(EVP_PKEY_METHOD) STACK_OF(EVP_PKEY_METHOD) *app_pkey_methods = NULL; extern const EVP_PKEY_METHOD rsa_pkey_meth, dh_pkey_meth, dsa_pkey_meth; -extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth; +extern const EVP_PKEY_METHOD ec_pkey_meth, hmac_pkey_meth, cmac_pkey_meth; static const EVP_PKEY_METHOD *standard_methods[] = { @@ -90,6 +90,7 @@ static const EVP_PKEY_METHOD *standard_methods[] = &ec_pkey_meth, #endif &hmac_pkey_meth, + &cmac_pkey_meth }; DECLARE_OBJ_BSEARCH_CMP_FN(const EVP_PKEY_METHOD *, const EVP_PKEY_METHOD *, @@ -203,6 +204,8 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags) if (!pmeth) return NULL; + memset(pmeth, 0, sizeof(EVP_PKEY_METHOD)); + pmeth->pkey_id = id; pmeth->flags = flags | EVP_PKEY_FLAG_DYNAMIC; @@ -235,6 +238,56 @@ EVP_PKEY_METHOD* EVP_PKEY_meth_new(int id, int flags) return pmeth; } +void EVP_PKEY_meth_get0_info(int *ppkey_id, int *pflags, + const EVP_PKEY_METHOD *meth) + { + if (ppkey_id) + *ppkey_id = meth->pkey_id; + if (pflags) + *pflags = meth->flags; + } + +void EVP_PKEY_meth_copy(EVP_PKEY_METHOD *dst, const EVP_PKEY_METHOD *src) + { + + dst->init = src->init; + dst->copy = src->copy; + dst->cleanup = src->cleanup; + + dst->paramgen_init = src->paramgen_init; + dst->paramgen = src->paramgen; + + dst->keygen_init = src->keygen_init; + dst->keygen = src->keygen; + + dst->sign_init = src->sign_init; + dst->sign = src->sign; + + dst->verify_init = src->verify_init; + dst->verify = src->verify; + + dst->verify_recover_init = src->verify_recover_init; + dst->verify_recover = src->verify_recover; + + dst->signctx_init = src->signctx_init; + dst->signctx = src->signctx; + + dst->verifyctx_init = src->verifyctx_init; + dst->verifyctx = src->verifyctx; + + dst->encrypt_init = src->encrypt_init; + dst->encrypt = src->encrypt; + + dst->decrypt_init = src->decrypt_init; + dst->decrypt = src->decrypt; + + dst->derive_init = src->derive_init; + dst->derive = src->derive; + + dst->ctrl = src->ctrl; + dst->ctrl_str = src->ctrl_str; + } + void EVP_PKEY_meth_free(EVP_PKEY_METHOD *pmeth) { if (pmeth && (pmeth->flags & EVP_PKEY_FLAG_DYNAMIC)) diff --git a/src/lib/libcrypto/hmac/hm_ameth.c b/src/lib/libcrypto/hmac/hm_ameth.c index 6d8a89149e..e03f24aeda 100644 --- a/src/lib/libcrypto/hmac/hm_ameth.c +++ b/src/lib/libcrypto/hmac/hm_ameth.c @@ -153,7 +153,7 @@ const EVP_PKEY_ASN1_METHOD hmac_asn1_meth = hmac_size, 0, - 0,0,0,0,0,0, + 0,0,0,0,0,0,0, hmac_key_free, hmac_pkey_ctrl, diff --git a/src/lib/libcrypto/hmac/hm_pmeth.c b/src/lib/libcrypto/hmac/hm_pmeth.c index 71e8567a14..0daa44511d 100644 --- a/src/lib/libcrypto/hmac/hm_pmeth.c +++ b/src/lib/libcrypto/hmac/hm_pmeth.c @@ -100,7 +100,8 @@ static int pkey_hmac_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) dctx = dst->data; dctx->md = sctx->md; HMAC_CTX_init(&dctx->ctx); - HMAC_CTX_copy(&dctx->ctx, &sctx->ctx); + if (!HMAC_CTX_copy(&dctx->ctx, &sctx->ctx)) + return 0; if (sctx->ktmp.data) { if (!ASN1_OCTET_STRING_set(&dctx->ktmp, @@ -141,7 +142,8 @@ static int pkey_hmac_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) static int int_update(EVP_MD_CTX *ctx,const void *data,size_t count) { HMAC_PKEY_CTX *hctx = ctx->pctx->data; - HMAC_Update(&hctx->ctx, data, count); + if (!HMAC_Update(&hctx->ctx, data, count)) + return 0; return 1; } @@ -167,7 +169,8 @@ static int hmac_signctx(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, if (!sig) return 1; - HMAC_Final(&hctx->ctx, sig, &hlen); + if (!HMAC_Final(&hctx->ctx, sig, &hlen)) + return 0; *siglen = (size_t)hlen; return 1; } @@ -192,8 +195,9 @@ static int pkey_hmac_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) case EVP_PKEY_CTRL_DIGESTINIT: key = (ASN1_OCTET_STRING *)ctx->pkey->pkey.ptr; - HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md, - ctx->engine); + if (!HMAC_Init_ex(&hctx->ctx, key->data, key->length, hctx->md, + ctx->engine)) + return 0; break; default: diff --git a/src/lib/libcrypto/hmac/hmac.c b/src/lib/libcrypto/hmac/hmac.c index 6c98fc43a3..ba27cbf56f 100644 --- a/src/lib/libcrypto/hmac/hmac.c +++ b/src/lib/libcrypto/hmac/hmac.c @@ -61,12 +61,34 @@ #include "cryptlib.h" #include +#ifdef OPENSSL_FIPS +#include +#endif + int HMAC_Init_ex(HMAC_CTX *ctx, const void *key, int len, const EVP_MD *md, ENGINE *impl) { int i,j,reset=0; unsigned char pad[HMAC_MAX_MD_CBLOCK]; +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + { + /* If we have an ENGINE need to allow non FIPS */ + if ((impl || ctx->i_ctx.engine) + && !(ctx->i_ctx.flags & EVP_CIPH_FLAG_NON_FIPS_ALLOW)) + { + EVPerr(EVP_F_HMAC_INIT_EX, EVP_R_DISABLED_FOR_FIPS); + return 0; + } + /* Other algorithm blocking will be done in FIPS_cmac_init, + * via FIPS_hmac_init_ex(). + */ + if (!impl && !ctx->i_ctx.engine) + return FIPS_hmac_init_ex(ctx, key, len, md, NULL); + } +#endif + if (md != NULL) { reset=1; @@ -133,6 +155,10 @@ int HMAC_Init(HMAC_CTX *ctx, const void *key, int len, const EVP_MD *md) int HMAC_Update(HMAC_CTX *ctx, const unsigned char *data, size_t len) { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !ctx->i_ctx.engine) + return FIPS_hmac_update(ctx, data, len); +#endif return EVP_DigestUpdate(&ctx->md_ctx,data,len); } @@ -140,6 +166,10 @@ int HMAC_Final(HMAC_CTX *ctx, unsigned char *md, unsigned int *len) { unsigned int i; unsigned char buf[EVP_MAX_MD_SIZE]; +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !ctx->i_ctx.engine) + return FIPS_hmac_final(ctx, md, len); +#endif if (!EVP_DigestFinal_ex(&ctx->md_ctx,buf,&i)) goto err; @@ -179,6 +209,13 @@ int HMAC_CTX_copy(HMAC_CTX *dctx, HMAC_CTX *sctx) void HMAC_CTX_cleanup(HMAC_CTX *ctx) { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !ctx->i_ctx.engine) + { + FIPS_hmac_ctx_cleanup(ctx); + return; + } +#endif EVP_MD_CTX_cleanup(&ctx->i_ctx); EVP_MD_CTX_cleanup(&ctx->o_ctx); EVP_MD_CTX_cleanup(&ctx->md_ctx); diff --git a/src/lib/libcrypto/ia64cpuid.S b/src/lib/libcrypto/ia64cpuid.S index d705fff7ee..7832b9b640 100644 --- a/src/lib/libcrypto/ia64cpuid.S +++ b/src/lib/libcrypto/ia64cpuid.S @@ -26,7 +26,7 @@ OPENSSL_atomic_add: { .mii; mov ar.ccv=r2 add r8=r2,r33 mov r3=r2 };; -{ .mmi; mf +{ .mmi; mf;; cmpxchg4.acq r2=[r32],r8,ar.ccv nop.i 0 };; { .mib; cmp.ne p6,p0=r2,r3 diff --git a/src/lib/libcrypto/idea/i_cbc.c b/src/lib/libcrypto/idea/i_cbc.c new file mode 100644 index 0000000000..ecb9cb8b83 --- /dev/null +++ b/src/lib/libcrypto/idea/i_cbc.c @@ -0,0 +1,168 @@ +/* crypto/idea/i_cbc.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +#include +#include "idea_lcl.h" + +void idea_cbc_encrypt(const unsigned char *in, unsigned char *out, long length, + IDEA_KEY_SCHEDULE *ks, unsigned char *iv, int encrypt) + { + register unsigned long tin0,tin1; + register unsigned long tout0,tout1,xor0,xor1; + register long l=length; + unsigned long tin[2]; + + if (encrypt) + { + n2l(iv,tout0); + n2l(iv,tout1); + iv-=8; + for (l-=8; l>=0; l-=8) + { + n2l(in,tin0); + n2l(in,tin1); + tin0^=tout0; + tin1^=tout1; + tin[0]=tin0; + tin[1]=tin1; + idea_encrypt(tin,ks); + tout0=tin[0]; l2n(tout0,out); + tout1=tin[1]; l2n(tout1,out); + } + if (l != -8) + { + n2ln(in,tin0,tin1,l+8); + tin0^=tout0; + tin1^=tout1; + tin[0]=tin0; + tin[1]=tin1; + idea_encrypt(tin,ks); + tout0=tin[0]; l2n(tout0,out); + tout1=tin[1]; l2n(tout1,out); + } + l2n(tout0,iv); + l2n(tout1,iv); + } + else + { + n2l(iv,xor0); + n2l(iv,xor1); + iv-=8; + for (l-=8; l>=0; l-=8) + { + n2l(in,tin0); tin[0]=tin0; + n2l(in,tin1); tin[1]=tin1; + idea_encrypt(tin,ks); + tout0=tin[0]^xor0; + tout1=tin[1]^xor1; + l2n(tout0,out); + l2n(tout1,out); + xor0=tin0; + xor1=tin1; + } + if (l != -8) + { + n2l(in,tin0); tin[0]=tin0; + n2l(in,tin1); tin[1]=tin1; + idea_encrypt(tin,ks); + tout0=tin[0]^xor0; + tout1=tin[1]^xor1; + l2nn(tout0,tout1,out,l+8); + xor0=tin0; + xor1=tin1; + } + l2n(xor0,iv); + l2n(xor1,iv); + } + tin0=tin1=tout0=tout1=xor0=xor1=0; + tin[0]=tin[1]=0; + } + +void idea_encrypt(unsigned long *d, IDEA_KEY_SCHEDULE *key) + { + register IDEA_INT *p; + register unsigned long x1,x2,x3,x4,t0,t1,ul; + + x2=d[0]; + x1=(x2>>16); + x4=d[1]; + x3=(x4>>16); + + p= &(key->data[0][0]); + + E_IDEA(0); + E_IDEA(1); + E_IDEA(2); + E_IDEA(3); + E_IDEA(4); + E_IDEA(5); + E_IDEA(6); + E_IDEA(7); + + x1&=0xffff; + idea_mul(x1,x1,*p,ul); p++; + + t0= x3+ *(p++); + t1= x2+ *(p++); + + x4&=0xffff; + idea_mul(x4,x4,*p,ul); + + d[0]=(t0&0xffff)|((x1&0xffff)<<16); + d[1]=(x4&0xffff)|((t1&0xffff)<<16); + } diff --git a/src/lib/libcrypto/idea/i_cfb64.c b/src/lib/libcrypto/idea/i_cfb64.c new file mode 100644 index 0000000000..66d49d520e --- /dev/null +++ b/src/lib/libcrypto/idea/i_cfb64.c @@ -0,0 +1,122 @@ +/* crypto/idea/i_cfb64.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +#include +#include "idea_lcl.h" + +/* The input and output encrypted as though 64bit cfb mode is being + * used. The extra state information to record how much of the + * 64bit block we have used is contained in *num; + */ + +void idea_cfb64_encrypt(const unsigned char *in, unsigned char *out, + long length, IDEA_KEY_SCHEDULE *schedule, + unsigned char *ivec, int *num, int encrypt) + { + register unsigned long v0,v1,t; + register int n= *num; + register long l=length; + unsigned long ti[2]; + unsigned char *iv,c,cc; + + iv=(unsigned char *)ivec; + if (encrypt) + { + while (l--) + { + if (n == 0) + { + n2l(iv,v0); ti[0]=v0; + n2l(iv,v1); ti[1]=v1; + idea_encrypt((unsigned long *)ti,schedule); + iv=(unsigned char *)ivec; + t=ti[0]; l2n(t,iv); + t=ti[1]; l2n(t,iv); + iv=(unsigned char *)ivec; + } + c= *(in++)^iv[n]; + *(out++)=c; + iv[n]=c; + n=(n+1)&0x07; + } + } + else + { + while (l--) + { + if (n == 0) + { + n2l(iv,v0); ti[0]=v0; + n2l(iv,v1); ti[1]=v1; + idea_encrypt((unsigned long *)ti,schedule); + iv=(unsigned char *)ivec; + t=ti[0]; l2n(t,iv); + t=ti[1]; l2n(t,iv); + iv=(unsigned char *)ivec; + } + cc= *(in++); + c=iv[n]; + iv[n]=cc; + *(out++)=c^cc; + n=(n+1)&0x07; + } + } + v0=v1=ti[0]=ti[1]=t=c=cc=0; + *num=n; + } + diff --git a/src/lib/libcrypto/idea/i_ecb.c b/src/lib/libcrypto/idea/i_ecb.c new file mode 100644 index 0000000000..fef38230a7 --- /dev/null +++ b/src/lib/libcrypto/idea/i_ecb.c @@ -0,0 +1,85 @@ +/* crypto/idea/i_ecb.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +#include +#include "idea_lcl.h" +#include + +const char IDEA_version[]="IDEA" OPENSSL_VERSION_PTEXT; + +const char *idea_options(void) + { + if (sizeof(short) != sizeof(IDEA_INT)) + return("idea(int)"); + else + return("idea(short)"); + } + +void idea_ecb_encrypt(const unsigned char *in, unsigned char *out, + IDEA_KEY_SCHEDULE *ks) + { + unsigned long l0,l1,d[2]; + + n2l(in,l0); d[0]=l0; + n2l(in,l1); d[1]=l1; + idea_encrypt(d,ks); + l0=d[0]; l2n(l0,out); + l1=d[1]; l2n(l1,out); + l0=l1=d[0]=d[1]=0; + } + diff --git a/src/lib/libcrypto/idea/i_ofb64.c b/src/lib/libcrypto/idea/i_ofb64.c new file mode 100644 index 0000000000..e749e88e34 --- /dev/null +++ b/src/lib/libcrypto/idea/i_ofb64.c @@ -0,0 +1,111 @@ +/* crypto/idea/i_ofb64.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +#include +#include "idea_lcl.h" + +/* The input and output encrypted as though 64bit ofb mode is being + * used. The extra state information to record how much of the + * 64bit block we have used is contained in *num; + */ +void idea_ofb64_encrypt(const unsigned char *in, unsigned char *out, + long length, IDEA_KEY_SCHEDULE *schedule, + unsigned char *ivec, int *num) + { + register unsigned long v0,v1,t; + register int n= *num; + register long l=length; + unsigned char d[8]; + register char *dp; + unsigned long ti[2]; + unsigned char *iv; + int save=0; + + iv=(unsigned char *)ivec; + n2l(iv,v0); + n2l(iv,v1); + ti[0]=v0; + ti[1]=v1; + dp=(char *)d; + l2n(v0,dp); + l2n(v1,dp); + while (l--) + { + if (n == 0) + { + idea_encrypt((unsigned long *)ti,schedule); + dp=(char *)d; + t=ti[0]; l2n(t,dp); + t=ti[1]; l2n(t,dp); + save++; + } + *(out++)= *(in++)^d[n]; + n=(n+1)&0x07; + } + if (save) + { + v0=ti[0]; + v1=ti[1]; + iv=(unsigned char *)ivec; + l2n(v0,iv); + l2n(v1,iv); + } + t=v0=v1=ti[0]=ti[1]=0; + *num=n; + } + diff --git a/src/lib/libcrypto/idea/i_skey.c b/src/lib/libcrypto/idea/i_skey.c new file mode 100644 index 0000000000..afb830964d --- /dev/null +++ b/src/lib/libcrypto/idea/i_skey.c @@ -0,0 +1,164 @@ +/* crypto/idea/i_skey.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +#include +#include +#include "idea_lcl.h" + +static IDEA_INT inverse(unsigned int xin); +void idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks) +#ifdef OPENSSL_FIPS + { + fips_cipher_abort(IDEA); + private_idea_set_encrypt_key(key, ks); + } +void private_idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks) +#endif + { + int i; + register IDEA_INT *kt,*kf,r0,r1,r2; + + kt= &(ks->data[0][0]); + n2s(key,kt[0]); n2s(key,kt[1]); n2s(key,kt[2]); n2s(key,kt[3]); + n2s(key,kt[4]); n2s(key,kt[5]); n2s(key,kt[6]); n2s(key,kt[7]); + + kf=kt; + kt+=8; + for (i=0; i<6; i++) + { + r2= kf[1]; + r1= kf[2]; + *(kt++)= ((r2<<9) | (r1>>7))&0xffff; + r0= kf[3]; + *(kt++)= ((r1<<9) | (r0>>7))&0xffff; + r1= kf[4]; + *(kt++)= ((r0<<9) | (r1>>7))&0xffff; + r0= kf[5]; + *(kt++)= ((r1<<9) | (r0>>7))&0xffff; + r1= kf[6]; + *(kt++)= ((r0<<9) | (r1>>7))&0xffff; + r0= kf[7]; + *(kt++)= ((r1<<9) | (r0>>7))&0xffff; + r1= kf[0]; + if (i >= 5) break; + *(kt++)= ((r0<<9) | (r1>>7))&0xffff; + *(kt++)= ((r1<<9) | (r2>>7))&0xffff; + kf+=8; + } + } + +void idea_set_decrypt_key(IDEA_KEY_SCHEDULE *ek, IDEA_KEY_SCHEDULE *dk) + { + int r; + register IDEA_INT *fp,*tp,t; + + tp= &(dk->data[0][0]); + fp= &(ek->data[8][0]); + for (r=0; r<9; r++) + { + *(tp++)=inverse(fp[0]); + *(tp++)=((int)(0x10000L-fp[2])&0xffff); + *(tp++)=((int)(0x10000L-fp[1])&0xffff); + *(tp++)=inverse(fp[3]); + if (r == 8) break; + fp-=6; + *(tp++)=fp[4]; + *(tp++)=fp[5]; + } + + tp= &(dk->data[0][0]); + t=tp[1]; + tp[1]=tp[2]; + tp[2]=t; + + t=tp[49]; + tp[49]=tp[50]; + tp[50]=t; + } + +/* taken directly from the 'paper' I'll have a look at it later */ +static IDEA_INT inverse(unsigned int xin) + { + long n1,n2,q,r,b1,b2,t; + + if (xin == 0) + b2=0; + else + { + n1=0x10001; + n2=xin; + b2=1; + b1=0; + + do { + r=(n1%n2); + q=(n1-r)/n2; + if (r == 0) + { if (b2 < 0) b2=0x10001+b2; } + else + { + n1=n2; + n2=r; + t=b2; + b2=b1-q*b2; + b1=t; + } + } while (r != 0); + } + return((IDEA_INT)b2); + } diff --git a/src/lib/libcrypto/idea/idea.h b/src/lib/libcrypto/idea/idea.h index 5782e54b0f..e9a1e7f1a5 100644 --- a/src/lib/libcrypto/idea/idea.h +++ b/src/lib/libcrypto/idea/idea.h @@ -83,6 +83,9 @@ typedef struct idea_key_st const char *idea_options(void); void idea_ecb_encrypt(const unsigned char *in, unsigned char *out, IDEA_KEY_SCHEDULE *ks); +#ifdef OPENSSL_FIPS +void private_idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks); +#endif void idea_set_encrypt_key(const unsigned char *key, IDEA_KEY_SCHEDULE *ks); void idea_set_decrypt_key(IDEA_KEY_SCHEDULE *ek, IDEA_KEY_SCHEDULE *dk); void idea_cbc_encrypt(const unsigned char *in, unsigned char *out, diff --git a/src/lib/libcrypto/idea/idea_lcl.h b/src/lib/libcrypto/idea/idea_lcl.h new file mode 100644 index 0000000000..f3dbfa67e9 --- /dev/null +++ b/src/lib/libcrypto/idea/idea_lcl.h @@ -0,0 +1,215 @@ +/* crypto/idea/idea_lcl.h */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +/* The new form of this macro (check if the a*b == 0) was suggested by + * Colin Plumb */ +/* Removal of the inner if from from Wei Dai 24/4/96 */ +#define idea_mul(r,a,b,ul) \ +ul=(unsigned long)a*b; \ +if (ul != 0) \ + { \ + r=(ul&0xffff)-(ul>>16); \ + r-=((r)>>16); \ + } \ +else \ + r=(-(int)a-b+1); /* assuming a or b is 0 and in range */ + +#ifdef undef +#define idea_mul(r,a,b,ul,sl) \ +if (a == 0) r=(0x10001-b)&0xffff; \ +else if (b == 0) r=(0x10001-a)&0xffff; \ +else { \ + ul=(unsigned long)a*b; \ + sl=(ul&0xffff)-(ul>>16); \ + if (sl <= 0) sl+=0x10001; \ + r=sl; \ + } +#endif + +/* 7/12/95 - Many thanks to Rhys Weatherley + * for pointing out that I was assuming little endian + * byte order for all quantities what idea + * actually used bigendian. No where in the spec does it mention + * this, it is all in terms of 16 bit numbers and even the example + * does not use byte streams for the input example :-(. + * If you byte swap each pair of input, keys and iv, the functions + * would produce the output as the old version :-(. + */ + +/* NOTE - c is not incremented as per n2l */ +#define n2ln(c,l1,l2,n) { \ + c+=n; \ + l1=l2=0; \ + switch (n) { \ + case 8: l2 =((unsigned long)(*(--(c)))) ; \ + case 7: l2|=((unsigned long)(*(--(c))))<< 8; \ + case 6: l2|=((unsigned long)(*(--(c))))<<16; \ + case 5: l2|=((unsigned long)(*(--(c))))<<24; \ + case 4: l1 =((unsigned long)(*(--(c)))) ; \ + case 3: l1|=((unsigned long)(*(--(c))))<< 8; \ + case 2: l1|=((unsigned long)(*(--(c))))<<16; \ + case 1: l1|=((unsigned long)(*(--(c))))<<24; \ + } \ + } + +/* NOTE - c is not incremented as per l2n */ +#define l2nn(l1,l2,c,n) { \ + c+=n; \ + switch (n) { \ + case 8: *(--(c))=(unsigned char)(((l2) )&0xff); \ + case 7: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \ + case 6: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \ + case 5: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \ + case 4: *(--(c))=(unsigned char)(((l1) )&0xff); \ + case 3: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \ + case 2: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \ + case 1: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \ + } \ + } + +#undef n2l +#define n2l(c,l) (l =((unsigned long)(*((c)++)))<<24L, \ + l|=((unsigned long)(*((c)++)))<<16L, \ + l|=((unsigned long)(*((c)++)))<< 8L, \ + l|=((unsigned long)(*((c)++)))) + +#undef l2n +#define l2n(l,c) (*((c)++)=(unsigned char)(((l)>>24L)&0xff), \ + *((c)++)=(unsigned char)(((l)>>16L)&0xff), \ + *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \ + *((c)++)=(unsigned char)(((l) )&0xff)) + +#undef s2n +#define s2n(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ + *((c)++)=(unsigned char)(((l)>> 8L)&0xff)) + +#undef n2s +#define n2s(c,l) (l =((IDEA_INT)(*((c)++)))<< 8L, \ + l|=((IDEA_INT)(*((c)++))) ) + +#ifdef undef +/* NOTE - c is not incremented as per c2l */ +#define c2ln(c,l1,l2,n) { \ + c+=n; \ + l1=l2=0; \ + switch (n) { \ + case 8: l2 =((unsigned long)(*(--(c))))<<24; \ + case 7: l2|=((unsigned long)(*(--(c))))<<16; \ + case 6: l2|=((unsigned long)(*(--(c))))<< 8; \ + case 5: l2|=((unsigned long)(*(--(c)))); \ + case 4: l1 =((unsigned long)(*(--(c))))<<24; \ + case 3: l1|=((unsigned long)(*(--(c))))<<16; \ + case 2: l1|=((unsigned long)(*(--(c))))<< 8; \ + case 1: l1|=((unsigned long)(*(--(c)))); \ + } \ + } + +/* NOTE - c is not incremented as per l2c */ +#define l2cn(l1,l2,c,n) { \ + c+=n; \ + switch (n) { \ + case 8: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \ + case 7: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \ + case 6: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \ + case 5: *(--(c))=(unsigned char)(((l2) )&0xff); \ + case 4: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \ + case 3: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \ + case 2: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \ + case 1: *(--(c))=(unsigned char)(((l1) )&0xff); \ + } \ + } + +#undef c2s +#define c2s(c,l) (l =((unsigned long)(*((c)++))) , \ + l|=((unsigned long)(*((c)++)))<< 8L) + +#undef s2c +#define s2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ + *((c)++)=(unsigned char)(((l)>> 8L)&0xff)) + +#undef c2l +#define c2l(c,l) (l =((unsigned long)(*((c)++))) , \ + l|=((unsigned long)(*((c)++)))<< 8L, \ + l|=((unsigned long)(*((c)++)))<<16L, \ + l|=((unsigned long)(*((c)++)))<<24L) + +#undef l2c +#define l2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ + *((c)++)=(unsigned char)(((l)>> 8L)&0xff), \ + *((c)++)=(unsigned char)(((l)>>16L)&0xff), \ + *((c)++)=(unsigned char)(((l)>>24L)&0xff)) +#endif + +#define E_IDEA(num) \ + x1&=0xffff; \ + idea_mul(x1,x1,*p,ul); p++; \ + x2+= *(p++); \ + x3+= *(p++); \ + x4&=0xffff; \ + idea_mul(x4,x4,*p,ul); p++; \ + t0=(x1^x3)&0xffff; \ + idea_mul(t0,t0,*p,ul); p++; \ + t1=(t0+(x2^x4))&0xffff; \ + idea_mul(t1,t1,*p,ul); p++; \ + t0+=t1; \ + x1^=t1; \ + x4^=t0; \ + ul=x2^t0; /* do the swap to x3 */ \ + x2=x3^t1; \ + x3=ul; + diff --git a/src/lib/libcrypto/md4/md4.h b/src/lib/libcrypto/md4/md4.h index c3ed9b3f75..a55368a790 100644 --- a/src/lib/libcrypto/md4/md4.h +++ b/src/lib/libcrypto/md4/md4.h @@ -105,6 +105,9 @@ typedef struct MD4state_st unsigned int num; } MD4_CTX; +#ifdef OPENSSL_FIPS +int private_MD4_Init(MD4_CTX *c); +#endif int MD4_Init(MD4_CTX *c); int MD4_Update(MD4_CTX *c, const void *data, size_t len); int MD4_Final(unsigned char *md, MD4_CTX *c); diff --git a/src/lib/libcrypto/md4/md4_dgst.c b/src/lib/libcrypto/md4/md4_dgst.c index e0c42e8596..82c2cb2d98 100644 --- a/src/lib/libcrypto/md4/md4_dgst.c +++ b/src/lib/libcrypto/md4/md4_dgst.c @@ -57,8 +57,9 @@ */ #include -#include "md4_locl.h" #include +#include +#include "md4_locl.h" const char MD4_version[]="MD4" OPENSSL_VERSION_PTEXT; @@ -70,7 +71,7 @@ const char MD4_version[]="MD4" OPENSSL_VERSION_PTEXT; #define INIT_DATA_C (unsigned long)0x98badcfeL #define INIT_DATA_D (unsigned long)0x10325476L -int MD4_Init(MD4_CTX *c) +fips_md_init(MD4) { memset (c,0,sizeof(*c)); c->A=INIT_DATA_A; diff --git a/src/lib/libcrypto/md5/md5.h b/src/lib/libcrypto/md5/md5.h index 4cbf84386b..541cc925fe 100644 --- a/src/lib/libcrypto/md5/md5.h +++ b/src/lib/libcrypto/md5/md5.h @@ -105,6 +105,9 @@ typedef struct MD5state_st unsigned int num; } MD5_CTX; +#ifdef OPENSSL_FIPS +int private_MD5_Init(MD5_CTX *c); +#endif int MD5_Init(MD5_CTX *c); int MD5_Update(MD5_CTX *c, const void *data, size_t len); int MD5_Final(unsigned char *md, MD5_CTX *c); diff --git a/src/lib/libcrypto/md5/md5_dgst.c b/src/lib/libcrypto/md5/md5_dgst.c index beace632e3..265890de52 100644 --- a/src/lib/libcrypto/md5/md5_dgst.c +++ b/src/lib/libcrypto/md5/md5_dgst.c @@ -59,6 +59,7 @@ #include #include "md5_locl.h" #include +#include const char MD5_version[]="MD5" OPENSSL_VERSION_PTEXT; @@ -70,7 +71,7 @@ const char MD5_version[]="MD5" OPENSSL_VERSION_PTEXT; #define INIT_DATA_C (unsigned long)0x98badcfeL #define INIT_DATA_D (unsigned long)0x10325476L -int MD5_Init(MD5_CTX *c) +fips_md_init(MD5) { memset (c,0,sizeof(*c)); c->A=INIT_DATA_A; diff --git a/src/lib/libcrypto/modes/asm/ghash-alpha.pl b/src/lib/libcrypto/modes/asm/ghash-alpha.pl new file mode 100644 index 0000000000..6358b2750f --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-alpha.pl @@ -0,0 +1,451 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. Even though +# loops are aggressively modulo-scheduled in respect to references to +# Htbl and Z.hi updates for 8 cycles per byte, measured performance is +# ~12 cycles per processed byte on 21264 CPU. It seems to be a dynamic +# scheduling "glitch," because uprofile(1) indicates uniform sample +# distribution, as if all instruction bundles execute in 1.5 cycles. +# Meaning that it could have been even faster, yet 12 cycles is ~60% +# better than gcc-generated code and ~80% than code generated by vendor +# compiler. + +$cnt="v0"; # $0 +$t0="t0"; +$t1="t1"; +$t2="t2"; +$Thi0="t3"; # $4 +$Tlo0="t4"; +$Thi1="t5"; +$Tlo1="t6"; +$rem="t7"; # $8 +################# +$Xi="a0"; # $16, input argument block +$Htbl="a1"; +$inp="a2"; +$len="a3"; +$nlo="a4"; # $20 +$nhi="a5"; +$Zhi="t8"; +$Zlo="t9"; +$Xhi="t10"; # $24 +$Xlo="t11"; +$remp="t12"; +$rem_4bit="AT"; # $28 + +{ my $N; + sub loop() { + + $N++; +$code.=<<___; +.align 4 + extbl $Xlo,7,$nlo + and $nlo,0xf0,$nhi + sll $nlo,4,$nlo + and $nlo,0xf0,$nlo + + addq $nlo,$Htbl,$nlo + ldq $Zlo,8($nlo) + addq $nhi,$Htbl,$nhi + ldq $Zhi,0($nlo) + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + lda $cnt,6(zero) + extbl $Xlo,6,$nlo + + ldq $Tlo1,8($nhi) + s8addq $remp,$rem_4bit,$remp + ldq $Thi1,0($nhi) + srl $Zlo,4,$Zlo + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + and $nlo,0xf0,$nhi + + xor $Tlo1,$Zlo,$Zlo + sll $nlo,4,$nlo + xor $Thi1,$Zhi,$Zhi + and $nlo,0xf0,$nlo + + addq $nlo,$Htbl,$nlo + ldq $Tlo0,8($nlo) + addq $nhi,$Htbl,$nhi + ldq $Thi0,0($nlo) + +.Looplo$N: + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + subq $cnt,1,$cnt + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + extbl $Xlo,$cnt,$nlo + + and $nlo,0xf0,$nhi + xor $Thi0,$Zhi,$Zhi + xor $Tlo0,$Zlo,$Zlo + sll $nlo,4,$nlo + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + and $nlo,0xf0,$nlo + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + addq $nlo,$Htbl,$nlo + addq $nhi,$Htbl,$nhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + ldq $Tlo0,8($nlo) + xor $t0,$Zlo,$Zlo + + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + ldq $Thi0,0($nlo) + bne $cnt,.Looplo$N + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + lda $cnt,7(zero) + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + extbl $Xhi,$cnt,$nlo + + and $nlo,0xf0,$nhi + xor $Thi0,$Zhi,$Zhi + xor $Tlo0,$Zlo,$Zlo + sll $nlo,4,$nlo + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + and $nlo,0xf0,$nlo + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + addq $nlo,$Htbl,$nlo + addq $nhi,$Htbl,$nhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + ldq $Tlo0,8($nlo) + xor $t0,$Zlo,$Zlo + + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + ldq $Thi0,0($nlo) + unop + + +.Loophi$N: + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + subq $cnt,1,$cnt + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + extbl $Xhi,$cnt,$nlo + + and $nlo,0xf0,$nhi + xor $Thi0,$Zhi,$Zhi + xor $Tlo0,$Zlo,$Zlo + sll $nlo,4,$nlo + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + and $nlo,0xf0,$nlo + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + addq $nlo,$Htbl,$nlo + addq $nhi,$Htbl,$nhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + ldq $Tlo0,8($nlo) + xor $t0,$Zlo,$Zlo + + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + ldq $Thi0,0($nlo) + bne $cnt,.Loophi$N + + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + srl $Zlo,4,$Zlo + + ldq $Tlo1,8($nhi) + xor $rem,$Zhi,$Zhi + ldq $Thi1,0($nhi) + s8addq $remp,$rem_4bit,$remp + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $t0,$Zlo,$Zlo + + xor $Tlo0,$Zlo,$Zlo + xor $Thi0,$Zhi,$Zhi + + and $Zlo,0x0f,$remp + sll $Zhi,60,$t0 + srl $Zlo,4,$Zlo + + s8addq $remp,$rem_4bit,$remp + xor $rem,$Zhi,$Zhi + + ldq $rem,0($remp) + srl $Zhi,4,$Zhi + xor $Tlo1,$Zlo,$Zlo + xor $Thi1,$Zhi,$Zhi + xor $t0,$Zlo,$Zlo + xor $rem,$Zhi,$Zhi +___ +}} + +$code=<<___; +#ifdef __linux__ +#include +#else +#include +#include +#endif + +.text + +.set noat +.set noreorder +.globl gcm_gmult_4bit +.align 4 +.ent gcm_gmult_4bit +gcm_gmult_4bit: + .frame sp,0,ra + .prologue 0 + + ldq $Xlo,8($Xi) + ldq $Xhi,0($Xi) + + br $rem_4bit,.Lpic1 +.Lpic1: lda $rem_4bit,rem_4bit-.Lpic1($rem_4bit) +___ + + &loop(); + +$code.=<<___; + srl $Zlo,24,$t0 # byte swap + srl $Zlo,8,$t1 + + sll $Zlo,8,$t2 + sll $Zlo,24,$Zlo + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + + zapnot $Zlo,0x88,$Zlo + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zlo,$t0,$Zlo + srl $Zhi,24,$t0 + srl $Zhi,8,$t1 + + or $Zlo,$t2,$Zlo + sll $Zhi,8,$t2 + sll $Zhi,24,$Zhi + + srl $Zlo,32,$Xlo + sll $Zlo,32,$Zlo + + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + or $Zlo,$Xlo,$Xlo + + zapnot $Zhi,0x88,$Zhi + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zhi,$t0,$Zhi + or $Zhi,$t2,$Zhi + + srl $Zhi,32,$Xhi + sll $Zhi,32,$Zhi + + or $Zhi,$Xhi,$Xhi + stq $Xlo,8($Xi) + stq $Xhi,0($Xi) + + ret (ra) +.end gcm_gmult_4bit +___ + +$inhi="s0"; +$inlo="s1"; + +$code.=<<___; +.globl gcm_ghash_4bit +.align 4 +.ent gcm_ghash_4bit +gcm_ghash_4bit: + lda sp,-32(sp) + stq ra,0(sp) + stq s0,8(sp) + stq s1,16(sp) + .mask 0x04000600,-32 + .frame sp,32,ra + .prologue 0 + + ldq_u $inhi,0($inp) + ldq_u $Thi0,7($inp) + ldq_u $inlo,8($inp) + ldq_u $Tlo0,15($inp) + ldq $Xhi,0($Xi) + ldq $Xlo,8($Xi) + + br $rem_4bit,.Lpic2 +.Lpic2: lda $rem_4bit,rem_4bit-.Lpic2($rem_4bit) + +.Louter: + extql $inhi,$inp,$inhi + extqh $Thi0,$inp,$Thi0 + or $inhi,$Thi0,$inhi + lda $inp,16($inp) + + extql $inlo,$inp,$inlo + extqh $Tlo0,$inp,$Tlo0 + or $inlo,$Tlo0,$inlo + subq $len,16,$len + + xor $Xlo,$inlo,$Xlo + xor $Xhi,$inhi,$Xhi +___ + + &loop(); + +$code.=<<___; + srl $Zlo,24,$t0 # byte swap + srl $Zlo,8,$t1 + + sll $Zlo,8,$t2 + sll $Zlo,24,$Zlo + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + + zapnot $Zlo,0x88,$Zlo + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zlo,$t0,$Zlo + srl $Zhi,24,$t0 + srl $Zhi,8,$t1 + + or $Zlo,$t2,$Zlo + sll $Zhi,8,$t2 + sll $Zhi,24,$Zhi + + srl $Zlo,32,$Xlo + sll $Zlo,32,$Zlo + beq $len,.Ldone + + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + or $Zlo,$Xlo,$Xlo + ldq_u $inhi,0($inp) + + zapnot $Zhi,0x88,$Zhi + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + ldq_u $Thi0,7($inp) + + or $Zhi,$t0,$Zhi + or $Zhi,$t2,$Zhi + ldq_u $inlo,8($inp) + ldq_u $Tlo0,15($inp) + + srl $Zhi,32,$Xhi + sll $Zhi,32,$Zhi + + or $Zhi,$Xhi,$Xhi + br zero,.Louter + +.Ldone: + zapnot $t0,0x11,$t0 + zapnot $t1,0x22,$t1 + or $Zlo,$Xlo,$Xlo + + zapnot $Zhi,0x88,$Zhi + or $t0,$t1,$t0 + zapnot $t2,0x44,$t2 + + or $Zhi,$t0,$Zhi + or $Zhi,$t2,$Zhi + + srl $Zhi,32,$Xhi + sll $Zhi,32,$Zhi + + or $Zhi,$Xhi,$Xhi + + stq $Xlo,8($Xi) + stq $Xhi,0($Xi) + + .set noreorder + /*ldq ra,0(sp)*/ + ldq s0,8(sp) + ldq s1,16(sp) + lda sp,32(sp) + ret (ra) +.end gcm_ghash_4bit + +.align 4 +rem_4bit: + .quad 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 + .quad 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 + .quad 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 + .quad 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 +.ascii "GHASH for Alpha, CRYPTOGAMS by " +.align 4 + +___ +$output=shift and open STDOUT,">$output"; +print $code; +close STDOUT; + diff --git a/src/lib/libcrypto/modes/asm/ghash-armv4.pl b/src/lib/libcrypto/modes/asm/ghash-armv4.pl new file mode 100644 index 0000000000..d91586ee29 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-armv4.pl @@ -0,0 +1,429 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# April 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+32 bytes shared table]. There is no +# experimental performance data available yet. The only approximation +# that can be made at this point is based on code size. Inner loop is +# 32 instructions long and on single-issue core should execute in <40 +# cycles. Having verified that gcc 3.4 didn't unroll corresponding +# loop, this assembler loop body was found to be ~3x smaller than +# compiler-generated one... +# +# July 2010 +# +# Rescheduling for dual-issue pipeline resulted in 8.5% improvement on +# Cortex A8 core and ~25 cycles per processed byte (which was observed +# to be ~3 times faster than gcc-generated code:-) +# +# February 2011 +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Cortex A8 core and ~23.5 cycles per byte. +# +# March 2011 +# +# Add NEON implementation featuring polynomial multiplication, i.e. no +# lookup tables involved. On Cortex A8 it was measured to process one +# byte in 15 cycles or 55% faster than integer-only code. + +# ==================================================================== +# Note about "528B" variant. In ARM case it makes lesser sense to +# implement it for following reasons: +# +# - performance improvement won't be anywhere near 50%, because 128- +# bit shift operation is neatly fused with 128-bit xor here, and +# "538B" variant would eliminate only 4-5 instructions out of 32 +# in the inner loop (meaning that estimated improvement is ~15%); +# - ARM-based systems are often embedded ones and extra memory +# consumption might be unappreciated (for so little improvement); +# +# Byte order [in]dependence. ========================================= +# +# Caller is expected to maintain specific *dword* order in Htable, +# namely with *least* significant dword of 128-bit value at *lower* +# address. This differs completely from C code and has everything to +# do with ldm instruction and order in which dwords are "consumed" by +# algorithm. *Byte* order within these dwords in turn is whatever +# *native* byte order on current platform. See gcm128.c for working +# example... + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$Xi="r0"; # argument block +$Htbl="r1"; +$inp="r2"; +$len="r3"; + +$Zll="r4"; # variables +$Zlh="r5"; +$Zhl="r6"; +$Zhh="r7"; +$Tll="r8"; +$Tlh="r9"; +$Thl="r10"; +$Thh="r11"; +$nlo="r12"; +################# r13 is stack pointer +$nhi="r14"; +################# r15 is program counter + +$rem_4bit=$inp; # used in gcm_gmult_4bit +$cnt=$len; + +sub Zsmash() { + my $i=12; + my @args=@_; + for ($Zll,$Zlh,$Zhl,$Zhh) { + $code.=<<___; +#if __ARM_ARCH__>=7 && defined(__ARMEL__) + rev $_,$_ + str $_,[$Xi,#$i] +#elif defined(__ARMEB__) + str $_,[$Xi,#$i] +#else + mov $Tlh,$_,lsr#8 + strb $_,[$Xi,#$i+3] + mov $Thl,$_,lsr#16 + strb $Tlh,[$Xi,#$i+2] + mov $Thh,$_,lsr#24 + strb $Thl,[$Xi,#$i+1] + strb $Thh,[$Xi,#$i] +#endif +___ + $code.="\t".shift(@args)."\n"; + $i-=4; + } +} + +$code=<<___; +#include "arm_arch.h" + +.text +.code 32 + +.type rem_4bit,%object +.align 5 +rem_4bit: +.short 0x0000,0x1C20,0x3840,0x2460 +.short 0x7080,0x6CA0,0x48C0,0x54E0 +.short 0xE100,0xFD20,0xD940,0xC560 +.short 0x9180,0x8DA0,0xA9C0,0xB5E0 +.size rem_4bit,.-rem_4bit + +.type rem_4bit_get,%function +rem_4bit_get: + sub $rem_4bit,pc,#8 + sub $rem_4bit,$rem_4bit,#32 @ &rem_4bit + b .Lrem_4bit_got + nop +.size rem_4bit_get,.-rem_4bit_get + +.global gcm_ghash_4bit +.type gcm_ghash_4bit,%function +gcm_ghash_4bit: + sub r12,pc,#8 + add $len,$inp,$len @ $len to point at the end + stmdb sp!,{r3-r11,lr} @ save $len/end too + sub r12,r12,#48 @ &rem_4bit + + ldmia r12,{r4-r11} @ copy rem_4bit ... + stmdb sp!,{r4-r11} @ ... to stack + + ldrb $nlo,[$inp,#15] + ldrb $nhi,[$Xi,#15] +.Louter: + eor $nlo,$nlo,$nhi + and $nhi,$nlo,#0xf0 + and $nlo,$nlo,#0x0f + mov $cnt,#14 + + add $Zhh,$Htbl,$nlo,lsl#4 + ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] + add $Thh,$Htbl,$nhi + ldrb $nlo,[$inp,#14] + + and $nhi,$Zll,#0xf @ rem + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + add $nhi,$nhi,$nhi + eor $Zll,$Tll,$Zll,lsr#4 + ldrh $Tll,[sp,$nhi] @ rem_4bit[rem] + eor $Zll,$Zll,$Zlh,lsl#28 + ldrb $nhi,[$Xi,#14] + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + eor $nlo,$nlo,$nhi + and $nhi,$nlo,#0xf0 + and $nlo,$nlo,#0x0f + eor $Zhh,$Zhh,$Tll,lsl#16 + +.Linner: + add $Thh,$Htbl,$nlo,lsl#4 + and $nlo,$Zll,#0xf @ rem + subs $cnt,$cnt,#1 + add $nlo,$nlo,$nlo + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] + eor $Zll,$Tll,$Zll,lsr#4 + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + ldrh $Tll,[sp,$nlo] @ rem_4bit[rem] + eor $Zhl,$Thl,$Zhl,lsr#4 + ldrplb $nlo,[$inp,$cnt] + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + + add $Thh,$Htbl,$nhi + and $nhi,$Zll,#0xf @ rem + eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] + add $nhi,$nhi,$nhi + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + eor $Zll,$Tll,$Zll,lsr#4 + ldrplb $Tll,[$Xi,$cnt] + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + ldrh $Tlh,[sp,$nhi] + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eorpl $nlo,$nlo,$Tll + eor $Zhh,$Thh,$Zhh,lsr#4 + andpl $nhi,$nlo,#0xf0 + andpl $nlo,$nlo,#0x0f + eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem] + bpl .Linner + + ldr $len,[sp,#32] @ re-load $len/end + add $inp,$inp,#16 + mov $nhi,$Zll +___ + &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]"); +$code.=<<___; + bne .Louter + + add sp,sp,#36 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size gcm_ghash_4bit,.-gcm_ghash_4bit + +.global gcm_gmult_4bit +.type gcm_gmult_4bit,%function +gcm_gmult_4bit: + stmdb sp!,{r4-r11,lr} + ldrb $nlo,[$Xi,#15] + b rem_4bit_get +.Lrem_4bit_got: + and $nhi,$nlo,#0xf0 + and $nlo,$nlo,#0x0f + mov $cnt,#14 + + add $Zhh,$Htbl,$nlo,lsl#4 + ldmia $Zhh,{$Zll-$Zhh} @ load Htbl[nlo] + ldrb $nlo,[$Xi,#14] + + add $Thh,$Htbl,$nhi + and $nhi,$Zll,#0xf @ rem + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + add $nhi,$nhi,$nhi + eor $Zll,$Tll,$Zll,lsr#4 + ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + and $nhi,$nlo,#0xf0 + eor $Zhh,$Zhh,$Tll,lsl#16 + and $nlo,$nlo,#0x0f + +.Loop: + add $Thh,$Htbl,$nlo,lsl#4 + and $nlo,$Zll,#0xf @ rem + subs $cnt,$cnt,#1 + add $nlo,$nlo,$nlo + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo] + eor $Zll,$Tll,$Zll,lsr#4 + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + eor $Zlh,$Zlh,$Zhl,lsl#28 + ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem] + eor $Zhl,$Thl,$Zhl,lsr#4 + ldrplb $nlo,[$Xi,$cnt] + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + + add $Thh,$Htbl,$nhi + and $nhi,$Zll,#0xf @ rem + eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] + add $nhi,$nhi,$nhi + ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi] + eor $Zll,$Tll,$Zll,lsr#4 + eor $Zll,$Zll,$Zlh,lsl#28 + eor $Zlh,$Tlh,$Zlh,lsr#4 + ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem] + eor $Zlh,$Zlh,$Zhl,lsl#28 + eor $Zhl,$Thl,$Zhl,lsr#4 + eor $Zhl,$Zhl,$Zhh,lsl#28 + eor $Zhh,$Thh,$Zhh,lsr#4 + andpl $nhi,$nlo,#0xf0 + andpl $nlo,$nlo,#0x0f + eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem] + bpl .Loop +___ + &Zsmash(); +$code.=<<___; +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) +#endif +.size gcm_gmult_4bit,.-gcm_gmult_4bit +___ +{ +my $cnt=$Htbl; # $Htbl is used once in the very beginning + +my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7)); +my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15)); + +# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit +# in Zo. Or should I say "top bit", because GHASH is specified in +# reverse bit order? Otherwise straightforward 128-bt H by one input +# byte multiplication and modulo-reduction, times 16. + +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; } +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; } +sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; } + +$code.=<<___; +#if __ARM_ARCH__>=7 +.fpu neon + +.global gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + sub $Htbl,#16 @ point at H in GCM128_CTX + vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi + vmov.i32 $mod,#0xe1 @ our irreducible polynomial + vld1.64 `&Dlo("$IN")`,[$Xi,:64]! + vshr.u64 $mod,#32 + vldmia $Htbl,{$Hhi-$Hlo} @ load H + veor $zero,$zero +#ifdef __ARMEL__ + vrev64.8 $IN,$IN +#endif + veor $Qpost,$Qpost + veor $R,$R + mov $cnt,#16 + veor $Z,$Z + mov $len,#16 + veor $Zo,$Zo + vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte + b .Linner_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.global gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi + vmov.i32 $mod,#0xe1 @ our irreducible polynomial + vld1.64 `&Dlo("$Z")`,[$Xi,:64]! + vshr.u64 $mod,#32 + vldmia $Xi,{$Hhi-$Hlo} @ load H + veor $zero,$zero + nop +#ifdef __ARMEL__ + vrev64.8 $Z,$Z +#endif +.Louter_neon: + vld1.64 `&Dhi($IN)`,[$inp]! @ load inp + veor $Qpost,$Qpost + vld1.64 `&Dlo($IN)`,[$inp]! + veor $R,$R + mov $cnt,#16 +#ifdef __ARMEL__ + vrev64.8 $IN,$IN +#endif + veor $Zo,$Zo + veor $IN,$Z @ inp^=Xi + veor $Z,$Z + vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte +.Linner_neon: + subs $cnt,$cnt,#1 + vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i] + vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i] + vext.8 $IN,$zero,#1 @ IN>>=8 + + veor $Z,$Qpost @ modulo-scheduled part + vshl.i64 `&Dlo("$R")`,#48 + vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte + veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")` + + veor `&Dhi("$Z")`,`&Dlo("$R")` + vuzp.8 $Qlo,$Qhi + vsli.8 $Zo,$T,#1 @ compose the "carry" byte + vext.8 $Z,$zero,#1 @ Z>>=8 + + vmull.p8 $R,$Zo,$mod @ "carry"·0xe1 + vshr.u8 $Zo,$T,#7 @ save Z's bottom bit + vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8 + veor $Z,$Qhi + bne .Linner_neon + + veor $Z,$Qpost @ modulo-scheduled artefact + vshl.i64 `&Dlo("$R")`,#48 + veor `&Dhi("$Z")`,`&Dlo("$R")` + + @ finalization, normalize Z:Zo + vand $Zo,$mod @ suffices to mask the bit + vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63 + vshl.i64 $Z,#1 + subs $len,#16 + vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1 + bne .Louter_neon + +#ifdef __ARMEL__ + vrev64.8 $Z,$Z +#endif + sub $Xi,#16 + vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi + vst1.64 `&Dlo("$Z")`,[$Xi,:64] + + bx lr +.size gcm_ghash_neon,.-gcm_ghash_neon +#endif +___ +} +$code.=<<___; +.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by " +.align 2 +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4 +print $code; +close STDOUT; # enforce flush diff --git a/src/lib/libcrypto/modes/asm/ghash-ia64.pl b/src/lib/libcrypto/modes/asm/ghash-ia64.pl new file mode 100755 index 0000000000..0354c95444 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-ia64.pl @@ -0,0 +1,463 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. Streamed +# GHASH performance was measured to be 6.67 cycles per processed byte +# on Itanium 2, which is >90% better than Microsoft compiler generated +# code. To anchor to something else sha1-ia64.pl module processes one +# byte in 5.7 cycles. On Itanium GHASH should run at ~8.5 cycles per +# byte. + +# September 2010 +# +# It was originally thought that it makes lesser sense to implement +# "528B" variant on Itanium 2 for following reason. Because number of +# functional units is naturally limited, it appeared impossible to +# implement "528B" loop in 4 cycles, only in 5. This would mean that +# theoretically performance improvement couldn't be more than 20%. +# But occasionally you prove yourself wrong:-) I figured out a way to +# fold couple of instructions and having freed yet another instruction +# slot by unrolling the loop... Resulting performance is 4.45 cycles +# per processed byte and 50% better than "256B" version. On original +# Itanium performance should remain the same as the "256B" version, +# i.e. ~8.5 cycles. + +$output=shift and (open STDOUT,">$output" or die "can't open $output: $!"); + +if ($^O eq "hpux") { + $ADDP="addp4"; + for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } +} else { $ADDP="add"; } +for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); + $big_endian=0 if (/\-DL_ENDIAN/); } +if (!defined($big_endian)) + { $big_endian=(unpack('L',pack('N',1))==1); } + +sub loop() { +my $label=shift; +my ($p16,$p17)=(shift)?("p63","p63"):("p16","p17"); # mask references to inp + +# Loop is scheduled for 6 ticks on Itanium 2 and 8 on Itanium, i.e. +# in scalable manner;-) Naturally assuming data in L1 cache... +# Special note about 'dep' instruction, which is used to construct +# &rem_4bit[Zlo&0xf]. It works, because rem_4bit is aligned at 128 +# bytes boundary and lower 7 bits of its address are guaranteed to +# be zero. +$code.=<<___; +$label: +{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8 + (p19) dep rem=Zlo,rem_4bitp,3,4 } +{ .mfi; (p19) xor Zhi=Zhi,Hhi + ($p17) xor xi[1]=xi[1],in[1] };; +{ .mfi; (p18) ld8 Hhi=[Hi[1]] + (p19) shrp Zlo=Zhi,Zlo,4 } +{ .mfi; (p19) ld8 rem=[rem] + (p18) and Hi[1]=mask0xf0,xi[2] };; +{ .mmi; ($p16) ld1 in[0]=[inp],-1 + (p18) xor Zlo=Zlo,Hlo + (p19) shr.u Zhi=Zhi,4 } +{ .mib; (p19) xor Hhi=Hhi,rem + (p18) add Hi[1]=Htbl,Hi[1] };; + +{ .mfi; (p18) ld8 Hlo=[Hi[1]],-8 + (p18) dep rem=Zlo,rem_4bitp,3,4 } +{ .mfi; (p17) shladd Hi[0]=xi[1],4,r0 + (p18) xor Zhi=Zhi,Hhi };; +{ .mfi; (p18) ld8 Hhi=[Hi[1]] + (p18) shrp Zlo=Zhi,Zlo,4 } +{ .mfi; (p18) ld8 rem=[rem] + (p17) and Hi[0]=mask0xf0,Hi[0] };; +{ .mmi; (p16) ld1 xi[0]=[Xi],-1 + (p18) xor Zlo=Zlo,Hlo + (p18) shr.u Zhi=Zhi,4 } +{ .mib; (p18) xor Hhi=Hhi,rem + (p17) add Hi[0]=Htbl,Hi[0] + br.ctop.sptk $label };; +___ +} + +$code=<<___; +.explicit +.text + +prevfs=r2; prevlc=r3; prevpr=r8; +mask0xf0=r21; +rem=r22; rem_4bitp=r23; +Xi=r24; Htbl=r25; +inp=r26; end=r27; +Hhi=r28; Hlo=r29; +Zhi=r30; Zlo=r31; + +.align 128 +.skip 16 // aligns loop body +.global gcm_gmult_4bit# +.proc gcm_gmult_4bit# +gcm_gmult_4bit: + .prologue +{ .mmi; .save ar.pfs,prevfs + alloc prevfs=ar.pfs,2,6,0,8 + $ADDP Xi=15,in0 // &Xi[15] + mov rem_4bitp=ip } +{ .mii; $ADDP Htbl=8,in1 // &Htbl[0].lo + .save ar.lc,prevlc + mov prevlc=ar.lc + .save pr,prevpr + mov prevpr=pr };; + + .body + .rotr in[3],xi[3],Hi[2] + +{ .mib; ld1 xi[2]=[Xi],-1 // Xi[15] + mov mask0xf0=0xf0 + brp.loop.imp .Loop1,.Lend1-16};; +{ .mmi; ld1 xi[1]=[Xi],-1 // Xi[14] + };; +{ .mii; shladd Hi[1]=xi[2],4,r0 + mov pr.rot=0x7<<16 + mov ar.lc=13 };; +{ .mii; and Hi[1]=mask0xf0,Hi[1] + mov ar.ec=3 + xor Zlo=Zlo,Zlo };; +{ .mii; add Hi[1]=Htbl,Hi[1] // &Htbl[nlo].lo + add rem_4bitp=rem_4bit#-gcm_gmult_4bit#,rem_4bitp + xor Zhi=Zhi,Zhi };; +___ + &loop (".Loop1",1); +$code.=<<___; +.Lend1: +{ .mib; xor Zhi=Zhi,Hhi };; // modulo-scheduling artefact +{ .mib; mux1 Zlo=Zlo,\@rev };; +{ .mib; mux1 Zhi=Zhi,\@rev };; +{ .mmi; add Hlo=9,Xi;; // ;; is here to prevent + add Hhi=1,Xi };; // pipeline flush on Itanium +{ .mib; st8 [Hlo]=Zlo + mov pr=prevpr,0x1ffff };; +{ .mib; st8 [Hhi]=Zhi + mov ar.lc=prevlc + br.ret.sptk.many b0 };; +.endp gcm_gmult_4bit# +___ + +###################################################################### +# "528B" (well, "512B" actualy) streamed GHASH +# +$Xip="in0"; +$Htbl="in1"; +$inp="in2"; +$len="in3"; +$rem_8bit="loc0"; +$mask0xff="loc1"; +($sum,$rum) = $big_endian ? ("nop.m","nop.m") : ("sum","rum"); + +sub load_htable() { + for (my $i=0;$i<8;$i++) { + $code.=<<___; +{ .mmi; ld8 r`16+2*$i+1`=[r8],16 // Htable[$i].hi + ld8 r`16+2*$i`=[r9],16 } // Htable[$i].lo +{ .mmi; ldf8 f`32+2*$i+1`=[r10],16 // Htable[`8+$i`].hi + ldf8 f`32+2*$i`=[r11],16 // Htable[`8+$i`].lo +___ + $code.=shift if (($i+$#_)==7); + $code.="\t};;\n" + } +} + +$code.=<<___; +prevsp=r3; + +.align 32 +.skip 16 // aligns loop body +.global gcm_ghash_4bit# +.proc gcm_ghash_4bit# +gcm_ghash_4bit: + .prologue +{ .mmi; .save ar.pfs,prevfs + alloc prevfs=ar.pfs,4,2,0,0 + .vframe prevsp + mov prevsp=sp + mov $rem_8bit=ip };; + .body +{ .mfi; $ADDP r8=0+0,$Htbl + $ADDP r9=0+8,$Htbl } +{ .mfi; $ADDP r10=128+0,$Htbl + $ADDP r11=128+8,$Htbl };; +___ + &load_htable( + " $ADDP $Xip=15,$Xip", # &Xi[15] + " $ADDP $len=$len,$inp", # &inp[len] + " $ADDP $inp=15,$inp", # &inp[15] + " mov $mask0xff=0xff", + " add sp=-512,sp", + " andcm sp=sp,$mask0xff", # align stack frame + " add r14=0,sp", + " add r15=8,sp"); +$code.=<<___; +{ .mmi; $sum 1<<1 // go big-endian + add r8=256+0,sp + add r9=256+8,sp } +{ .mmi; add r10=256+128+0,sp + add r11=256+128+8,sp + add $len=-17,$len };; +___ +for($i=0;$i<8;$i++) { # generate first half of Hshr4[] +my ($rlo,$rhi)=("r".eval(16+2*$i),"r".eval(16+2*$i+1)); +$code.=<<___; +{ .mmi; st8 [r8]=$rlo,16 // Htable[$i].lo + st8 [r9]=$rhi,16 // Htable[$i].hi + shrp $rlo=$rhi,$rlo,4 }//;; +{ .mmi; stf8 [r10]=f`32+2*$i`,16 // Htable[`8+$i`].lo + stf8 [r11]=f`32+2*$i+1`,16 // Htable[`8+$i`].hi + shr.u $rhi=$rhi,4 };; +{ .mmi; st8 [r14]=$rlo,16 // Htable[$i].lo>>4 + st8 [r15]=$rhi,16 }//;; // Htable[$i].hi>>4 +___ +} +$code.=<<___; +{ .mmi; ld8 r16=[r8],16 // Htable[8].lo + ld8 r17=[r9],16 };; // Htable[8].hi +{ .mmi; ld8 r18=[r8],16 // Htable[9].lo + ld8 r19=[r9],16 } // Htable[9].hi +{ .mmi; rum 1<<5 // clear um.mfh + shrp r16=r17,r16,4 };; +___ +for($i=0;$i<6;$i++) { # generate second half of Hshr4[] +$code.=<<___; +{ .mmi; ld8 r`20+2*$i`=[r8],16 // Htable[`10+$i`].lo + ld8 r`20+2*$i+1`=[r9],16 // Htable[`10+$i`].hi + shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; +{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 + st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 + shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } +___ +} +$code.=<<___; +{ .mmi; shr.u r`16+2*$i+1`=r`16+2*$i+1`,4 };; +{ .mmi; st8 [r14]=r`16+2*$i`,16 // Htable[`8+$i`].lo>>4 + st8 [r15]=r`16+2*$i+1`,16 // Htable[`8+$i`].hi>>4 + shrp r`18+2*$i`=r`18+2*$i+1`,r`18+2*$i`,4 } +{ .mmi; add $Htbl=256,sp // &Htable[0] + add $rem_8bit=rem_8bit#-gcm_ghash_4bit#,$rem_8bit + shr.u r`18+2*$i+1`=r`18+2*$i+1`,4 };; +{ .mmi; st8 [r14]=r`18+2*$i` // Htable[`8+$i`].lo>>4 + st8 [r15]=r`18+2*$i+1` } // Htable[`8+$i`].hi>>4 +___ + +$in="r15"; +@xi=("r16","r17"); +@rem=("r18","r19"); +($Alo,$Ahi,$Blo,$Bhi,$Zlo,$Zhi)=("r20","r21","r22","r23","r24","r25"); +($Atbl,$Btbl)=("r26","r27"); + +$code.=<<___; # (p16) +{ .mmi; ld1 $in=[$inp],-1 //(p16) *inp-- + ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- + cmp.eq p0,p6=r0,r0 };; // clear p6 +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers + +$code.=<<___; # (p16),(p17) +{ .mmi; ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- + xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] +{ .mii; ld1 $in=[$inp],-1 //(p16) *inp-- + dep $Atbl=$xi[1],$Htbl,4,4 //(p17) &Htable[nlo].lo + and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 +.align 32 +.LOOP: +{ .mmi; +(p6) st8 [$Xip]=$Zhi,13 + xor $Zlo=$Zlo,$Zlo + add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi].lo +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers + +$code.=<<___; # (p16),(p17),(p18) +{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi + ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo + xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] +{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi + dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo +{ .mfi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 + xor $Zlo=$Zlo,$Alo };; //(p18) Z.lo^=Htable[nlo].lo +{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi + ld1 $in=[$inp],-1 } //(p16) *inp-- +{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) + mov $Zhi=$Ahi //(p18) Z.hi^=Htable[nlo].hi + and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 +{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi + ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- + shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) +{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff + add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers + +for ($i=1;$i<14;$i++) { +# Above and below fragments are derived from this one by removing +# unsuitable (p??) instructions. +$code.=<<___; # (p16),(p17),(p18),(p19) +{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi + ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo + shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 +{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] + xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo + xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] +{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi + ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] + dep $Atbl=$xi[1],$Htbl,4,4 } //(p17) &Htable[nlo].lo +{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 + xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo + xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi +{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi + ld1 $in=[$inp],-1 //(p16) *inp-- + shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 +{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) + xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi + and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 +{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi + ld1 $xi[0]=[$Xip],-1 //(p16) *Xi-- + shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) +{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff + xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 + add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers +} + +$code.=<<___; # (p17),(p18),(p19) +{ .mmi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi + ld8 $rem[0]=[$Btbl],-256 //(p18) Htable[nhi].lo,&Hshr4[nhi].lo + shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 +{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] + xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo + xor $xi[1]=$xi[1],$in };; //(p17) xi=$xi[i]^inp[i] +{ .mmi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi + ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] + dep $Atbl=$xi[1],$Htbl,4,4 };; //(p17) &Htable[nlo].lo +{ .mmi; shladd $rem[0]=$rem[0],4,r0 //(p18) Htable[nhi].lo<<4 + xor $Zlo=$Zlo,$Alo //(p18) Z.lo^=Htable[nlo].lo + xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi +{ .mmi; ld8 $Blo=[$Btbl],8 //(p18) Hshr4[nhi].lo,&Hshr4[nhi].hi + shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 +{ .mmi; xor $rem[0]=$rem[0],$Zlo //(p18) Z.lo^(Htable[nhi].lo<<4) + xor $Zhi=$Zhi,$Ahi //(p18) Z.hi^=Htable[nlo].hi + and $xi[1]=-16,$xi[1] };; //(p17) nhi=xi&0xf0 +{ .mmi; ld8 $Bhi=[$Btbl] //(p18) Hshr4[nhi].hi + shrp $Zlo=$Zhi,$Zlo,8 } //(p18) Z.lo=(Z.hi<<56)|(Z.lo>>8) +{ .mmi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff + xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 + add $Btbl=$xi[1],$Htbl };; //(p17) &Htable[nhi] +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers + +$code.=<<___; # (p18),(p19) +{ .mfi; ld8 $Alo=[$Atbl],8 //(p18) Htable[nlo].lo,&Htable[nlo].hi + shr.u $Zhi=$Zhi,8 } //(p19) Z.hi>>=8 +{ .mfi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] + xor $Zlo=$Zlo,$Blo };; //(p19) Z.lo^=Hshr4[nhi].lo +{ .mfi; ld8 $Ahi=[$Atbl] //(p18) Htable[nlo].hi + xor $Zlo=$Zlo,$Alo } //(p18) Z.lo^=Htable[nlo].lo +{ .mfi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] + xor $Zhi=$Zhi,$Bhi };; //(p19) Z.hi^=Hshr4[nhi].hi +{ .mfi; ld8 $Blo=[$Btbl],8 //(p18) Htable[nhi].lo,&Htable[nhi].hi + shl $rem[1]=$rem[1],48 } //(p19) rem_8bit[rem]<<48 +{ .mfi; shladd $rem[0]=$Zlo,4,r0 //(p18) Z.lo<<4 + xor $Zhi=$Zhi,$Ahi };; //(p18) Z.hi^=Htable[nlo].hi +{ .mfi; ld8 $Bhi=[$Btbl] //(p18) Htable[nhi].hi + shrp $Zlo=$Zhi,$Zlo,4 } //(p18) Z.lo=(Z.hi<<60)|(Z.lo>>4) +{ .mfi; and $rem[0]=$rem[0],$mask0xff //(p18) rem=($Zlo^(Htable[nhi].lo<<4))&0xff + xor $Zhi=$Zhi,$rem[1] };; //(p19) Z.hi^=rem_8bit[rem]<<48 +___ +push (@xi,shift(@xi)); push (@rem,shift(@rem)); # "rotate" registers + +$code.=<<___; # (p19) +{ .mmi; cmp.ltu p6,p0=$inp,$len + add $inp=32,$inp + shr.u $Zhi=$Zhi,4 } //(p19) Z.hi>>=4 +{ .mmi; shladd $rem[1]=$rem[1],1,$rem_8bit //(p19) &rem_8bit[rem] + xor $Zlo=$Zlo,$Blo //(p19) Z.lo^=Hshr4[nhi].lo + add $Xip=9,$Xip };; // &Xi.lo +{ .mmi; ld2 $rem[1]=[$rem[1]] //(p19) rem_8bit[rem] +(p6) ld1 $in=[$inp],-1 //[p16] *inp-- +(p6) extr.u $xi[1]=$Zlo,8,8 } //[p17] Xi[14] +{ .mmi; xor $Zhi=$Zhi,$Bhi //(p19) Z.hi^=Hshr4[nhi].hi +(p6) and $xi[0]=$Zlo,$mask0xff };; //[p16] Xi[15] +{ .mmi; st8 [$Xip]=$Zlo,-8 +(p6) xor $xi[0]=$xi[0],$in //[p17] xi=$xi[i]^inp[i] + shl $rem[1]=$rem[1],48 };; //(p19) rem_8bit[rem]<<48 +{ .mmi; +(p6) ld1 $in=[$inp],-1 //[p16] *inp-- + xor $Zhi=$Zhi,$rem[1] //(p19) Z.hi^=rem_8bit[rem]<<48 +(p6) dep $Atbl=$xi[0],$Htbl,4,4 } //[p17] &Htable[nlo].lo +{ .mib; +(p6) and $xi[0]=-16,$xi[0] //[p17] nhi=xi&0xf0 +(p6) br.cond.dptk.many .LOOP };; + +{ .mib; st8 [$Xip]=$Zhi };; +{ .mib; $rum 1<<1 // return to little-endian + .restore sp + mov sp=prevsp + br.ret.sptk.many b0 };; +.endp gcm_ghash_4bit# +___ +$code.=<<___; +.align 128 +.type rem_4bit#,\@object +rem_4bit: + data8 0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48 + data8 0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48 + data8 0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48 + data8 0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48 +.size rem_4bit#,128 +.type rem_8bit#,\@object +rem_8bit: + data1 0x00,0x00, 0x01,0xC2, 0x03,0x84, 0x02,0x46, 0x07,0x08, 0x06,0xCA, 0x04,0x8C, 0x05,0x4E + data1 0x0E,0x10, 0x0F,0xD2, 0x0D,0x94, 0x0C,0x56, 0x09,0x18, 0x08,0xDA, 0x0A,0x9C, 0x0B,0x5E + data1 0x1C,0x20, 0x1D,0xE2, 0x1F,0xA4, 0x1E,0x66, 0x1B,0x28, 0x1A,0xEA, 0x18,0xAC, 0x19,0x6E + data1 0x12,0x30, 0x13,0xF2, 0x11,0xB4, 0x10,0x76, 0x15,0x38, 0x14,0xFA, 0x16,0xBC, 0x17,0x7E + data1 0x38,0x40, 0x39,0x82, 0x3B,0xC4, 0x3A,0x06, 0x3F,0x48, 0x3E,0x8A, 0x3C,0xCC, 0x3D,0x0E + data1 0x36,0x50, 0x37,0x92, 0x35,0xD4, 0x34,0x16, 0x31,0x58, 0x30,0x9A, 0x32,0xDC, 0x33,0x1E + data1 0x24,0x60, 0x25,0xA2, 0x27,0xE4, 0x26,0x26, 0x23,0x68, 0x22,0xAA, 0x20,0xEC, 0x21,0x2E + data1 0x2A,0x70, 0x2B,0xB2, 0x29,0xF4, 0x28,0x36, 0x2D,0x78, 0x2C,0xBA, 0x2E,0xFC, 0x2F,0x3E + data1 0x70,0x80, 0x71,0x42, 0x73,0x04, 0x72,0xC6, 0x77,0x88, 0x76,0x4A, 0x74,0x0C, 0x75,0xCE + data1 0x7E,0x90, 0x7F,0x52, 0x7D,0x14, 0x7C,0xD6, 0x79,0x98, 0x78,0x5A, 0x7A,0x1C, 0x7B,0xDE + data1 0x6C,0xA0, 0x6D,0x62, 0x6F,0x24, 0x6E,0xE6, 0x6B,0xA8, 0x6A,0x6A, 0x68,0x2C, 0x69,0xEE + data1 0x62,0xB0, 0x63,0x72, 0x61,0x34, 0x60,0xF6, 0x65,0xB8, 0x64,0x7A, 0x66,0x3C, 0x67,0xFE + data1 0x48,0xC0, 0x49,0x02, 0x4B,0x44, 0x4A,0x86, 0x4F,0xC8, 0x4E,0x0A, 0x4C,0x4C, 0x4D,0x8E + data1 0x46,0xD0, 0x47,0x12, 0x45,0x54, 0x44,0x96, 0x41,0xD8, 0x40,0x1A, 0x42,0x5C, 0x43,0x9E + data1 0x54,0xE0, 0x55,0x22, 0x57,0x64, 0x56,0xA6, 0x53,0xE8, 0x52,0x2A, 0x50,0x6C, 0x51,0xAE + data1 0x5A,0xF0, 0x5B,0x32, 0x59,0x74, 0x58,0xB6, 0x5D,0xF8, 0x5C,0x3A, 0x5E,0x7C, 0x5F,0xBE + data1 0xE1,0x00, 0xE0,0xC2, 0xE2,0x84, 0xE3,0x46, 0xE6,0x08, 0xE7,0xCA, 0xE5,0x8C, 0xE4,0x4E + data1 0xEF,0x10, 0xEE,0xD2, 0xEC,0x94, 0xED,0x56, 0xE8,0x18, 0xE9,0xDA, 0xEB,0x9C, 0xEA,0x5E + data1 0xFD,0x20, 0xFC,0xE2, 0xFE,0xA4, 0xFF,0x66, 0xFA,0x28, 0xFB,0xEA, 0xF9,0xAC, 0xF8,0x6E + data1 0xF3,0x30, 0xF2,0xF2, 0xF0,0xB4, 0xF1,0x76, 0xF4,0x38, 0xF5,0xFA, 0xF7,0xBC, 0xF6,0x7E + data1 0xD9,0x40, 0xD8,0x82, 0xDA,0xC4, 0xDB,0x06, 0xDE,0x48, 0xDF,0x8A, 0xDD,0xCC, 0xDC,0x0E + data1 0xD7,0x50, 0xD6,0x92, 0xD4,0xD4, 0xD5,0x16, 0xD0,0x58, 0xD1,0x9A, 0xD3,0xDC, 0xD2,0x1E + data1 0xC5,0x60, 0xC4,0xA2, 0xC6,0xE4, 0xC7,0x26, 0xC2,0x68, 0xC3,0xAA, 0xC1,0xEC, 0xC0,0x2E + data1 0xCB,0x70, 0xCA,0xB2, 0xC8,0xF4, 0xC9,0x36, 0xCC,0x78, 0xCD,0xBA, 0xCF,0xFC, 0xCE,0x3E + data1 0x91,0x80, 0x90,0x42, 0x92,0x04, 0x93,0xC6, 0x96,0x88, 0x97,0x4A, 0x95,0x0C, 0x94,0xCE + data1 0x9F,0x90, 0x9E,0x52, 0x9C,0x14, 0x9D,0xD6, 0x98,0x98, 0x99,0x5A, 0x9B,0x1C, 0x9A,0xDE + data1 0x8D,0xA0, 0x8C,0x62, 0x8E,0x24, 0x8F,0xE6, 0x8A,0xA8, 0x8B,0x6A, 0x89,0x2C, 0x88,0xEE + data1 0x83,0xB0, 0x82,0x72, 0x80,0x34, 0x81,0xF6, 0x84,0xB8, 0x85,0x7A, 0x87,0x3C, 0x86,0xFE + data1 0xA9,0xC0, 0xA8,0x02, 0xAA,0x44, 0xAB,0x86, 0xAE,0xC8, 0xAF,0x0A, 0xAD,0x4C, 0xAC,0x8E + data1 0xA7,0xD0, 0xA6,0x12, 0xA4,0x54, 0xA5,0x96, 0xA0,0xD8, 0xA1,0x1A, 0xA3,0x5C, 0xA2,0x9E + data1 0xB5,0xE0, 0xB4,0x22, 0xB6,0x64, 0xB7,0xA6, 0xB2,0xE8, 0xB3,0x2A, 0xB1,0x6C, 0xB0,0xAE + data1 0xBB,0xF0, 0xBA,0x32, 0xB8,0x74, 0xB9,0xB6, 0xBC,0xF8, 0xBD,0x3A, 0xBF,0x7C, 0xBE,0xBE +.size rem_8bit#,512 +stringz "GHASH for IA64, CRYPTOGAMS by " +___ + +$code =~ s/mux1(\s+)\S+\@rev/nop.i$1 0x0/gm if ($big_endian); +$code =~ s/\`([^\`]*)\`/eval $1/gem; + +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/modes/asm/ghash-parisc.pl b/src/lib/libcrypto/modes/asm/ghash-parisc.pl new file mode 100644 index 0000000000..8c7454ee93 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-parisc.pl @@ -0,0 +1,730 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# April 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC +# it processes one byte in 19.6 cycles, which is more than twice as +# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for +# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per +# processed byte. This is ~2.2x faster than 64-bit code generated by +# vendor compiler (which used to be very hard to beat:-). +# +# Special thanks to polarhome.com for providing HP-UX account. + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; + $NREGS =6; +} else { + $LEVEL ="1.0"; #"\n\t.ALLOW\t2.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; + $NREGS =11; +} + +$FRAME=10*$SIZE_T+$FRAME_MARKER;# NREGS saved regs + frame marker + # [+ argument transfer] + +################# volatile registers +$Xi="%r26"; # argument block +$Htbl="%r25"; +$inp="%r24"; +$len="%r23"; +$Hhh=$Htbl; # variables +$Hll="%r22"; +$Zhh="%r21"; +$Zll="%r20"; +$cnt="%r19"; +$rem_4bit="%r28"; +$rem="%r29"; +$mask0xf0="%r31"; + +################# preserved registers +$Thh="%r1"; +$Tll="%r2"; +$nlo="%r3"; +$nhi="%r4"; +$byte="%r5"; +if ($SIZE_T==4) { + $Zhl="%r6"; + $Zlh="%r7"; + $Hhl="%r8"; + $Hlh="%r9"; + $Thl="%r10"; + $Tlh="%r11"; +} +$rem2="%r6"; # used in PA-RISC 2.0 code + +$code.=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR + .ALIGN 64 +gcm_gmult_4bit + .PROC + .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) +___ +$code.=<<___ if ($SIZE_T==4); + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) +___ +$code.=<<___; + blr %r0,$rem_4bit + ldi 3,$rem +L\$pic_gmult + andcm $rem_4bit,$rem,$rem_4bit + addl $inp,$len,$len + ldo L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit + ldi 0xf0,$mask0xf0 +___ +$code.=<<___ if ($SIZE_T==4); + ldi 31,$rem + mtctl $rem,%cr11 + extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0 + b L\$parisc1_gmult + nop +___ + +$code.=<<___; + ldb 15($Xi),$nlo + ldo 8($Htbl),$Hll + + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + + ldd $nlo($Hll),$Zll + ldd $nlo($Hhh),$Zhh + + depd,z $Zll,60,4,$rem + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldb 14($Xi),$nlo + + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + b L\$oop_gmult_pa2 + ldi 13,$cnt + + .ALIGN 8 +L\$oop_gmult_pa2 + xor $rem,$Zhh,$Zhh ; moved here to work around gas bug + depd,z $Zll,60,4,$rem + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nlo($Hll),$Tll + ldd $nlo($Hhh),$Thh + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + + xor $rem,$Zhh,$Zhh + depd,z $Zll,60,4,$rem + ldbx $cnt($Xi),$nlo + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + ldd $rem($rem_4bit),$rem + + xor $Tll,$Zll,$Zll + addib,uv -1,$cnt,L\$oop_gmult_pa2 + xor $Thh,$Zhh,$Zhh + + xor $rem,$Zhh,$Zhh + depd,z $Zll,60,4,$rem + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nlo($Hll),$Tll + ldd $nlo($Hhh),$Thh + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + + xor $rem,$Zhh,$Zhh + depd,z $Zll,60,4,$rem + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + + xor $rem,$Zhh,$Zhh + std $Zll,8($Xi) + std $Zhh,0($Xi) +___ + +$code.=<<___ if ($SIZE_T==4); + b L\$done_gmult + nop + +L\$parisc1_gmult + ldb 15($Xi),$nlo + ldo 12($Htbl),$Hll + ldo 8($Htbl),$Hlh + ldo 4($Htbl),$Hhl + + and $mask0xf0,$nlo,$nhi + zdep $nlo,27,4,$nlo + + ldwx $nlo($Hll),$Zll + ldwx $nlo($Hlh),$Zlh + ldwx $nlo($Hhl),$Zhl + ldwx $nlo($Hhh),$Zhh + zdep $Zll,28,4,$rem + ldb 14($Xi),$nlo + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhl,$Zlh,4,$Zlh + ldwx $nhi($Hlh),$Tlh + shrpw $Zhh,$Zhl,4,$Zhl + ldwx $nhi($Hhl),$Thl + extru $Zhh,27,28,$Zhh + ldwx $nhi($Hhh),$Thh + xor $rem,$Zhh,$Zhh + and $mask0xf0,$nlo,$nhi + zdep $nlo,27,4,$nlo + + xor $Tll,$Zll,$Zll + ldwx $nlo($Hll),$Tll + xor $Tlh,$Zlh,$Zlh + ldwx $nlo($Hlh),$Tlh + xor $Thl,$Zhl,$Zhl + b L\$oop_gmult_pa1 + ldi 13,$cnt + + .ALIGN 8 +L\$oop_gmult_pa1 + zdep $Zll,28,4,$rem + ldwx $nlo($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nlo($Hhh),$Thh + shrpw $Zhl,$Zlh,4,$Zlh + ldbx $cnt($Xi),$nlo + xor $Tll,$Zll,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhh,$Zhl,4,$Zhl + xor $Tlh,$Zlh,$Zlh + ldwx $nhi($Hlh),$Tlh + extru $Zhh,27,28,$Zhh + xor $Thl,$Zhl,$Zhl + ldwx $nhi($Hhl),$Thl + xor $rem,$Zhh,$Zhh + zdep $Zll,28,4,$rem + xor $Thh,$Zhh,$Zhh + ldwx $nhi($Hhh),$Thh + shrpw $Zlh,$Zll,4,$Zll + ldwx $rem($rem_4bit),$rem + shrpw $Zhl,$Zlh,4,$Zlh + shrpw $Zhh,$Zhl,4,$Zhl + and $mask0xf0,$nlo,$nhi + extru $Zhh,27,28,$Zhh + zdep $nlo,27,4,$nlo + xor $Tll,$Zll,$Zll + ldwx $nlo($Hll),$Tll + xor $Tlh,$Zlh,$Zlh + ldwx $nlo($Hlh),$Tlh + xor $rem,$Zhh,$Zhh + addib,uv -1,$cnt,L\$oop_gmult_pa1 + xor $Thl,$Zhl,$Zhl + + zdep $Zll,28,4,$rem + ldwx $nlo($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nlo($Hhh),$Thh + shrpw $Zhl,$Zlh,4,$Zlh + xor $Tll,$Zll,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhh,$Zhl,4,$Zhl + xor $Tlh,$Zlh,$Zlh + ldwx $nhi($Hlh),$Tlh + extru $Zhh,27,28,$Zhh + xor $rem,$Zhh,$Zhh + xor $Thl,$Zhl,$Zhl + ldwx $nhi($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $nhi($Hhh),$Thh + zdep $Zll,28,4,$rem + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + shrpw $Zhl,$Zlh,4,$Zlh + shrpw $Zhh,$Zhl,4,$Zhl + extru $Zhh,27,28,$Zhh + xor $Tll,$Zll,$Zll + xor $Tlh,$Zlh,$Zlh + xor $rem,$Zhh,$Zhh + stw $Zll,12($Xi) + xor $Thl,$Zhl,$Zhl + stw $Zlh,8($Xi) + xor $Thh,$Zhh,$Zhh + stw $Zhl,4($Xi) + stw $Zhh,0($Xi) +___ +$code.=<<___; +L\$done_gmult + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 +___ +$code.=<<___ if ($SIZE_T==4); + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 +___ +$code.=<<___; + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + + .EXPORT gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR + .ALIGN 64 +gcm_ghash_4bit + .PROC + .CALLINFO FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=11 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) +___ +$code.=<<___ if ($SIZE_T==4); + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) +___ +$code.=<<___; + blr %r0,$rem_4bit + ldi 3,$rem +L\$pic_ghash + andcm $rem_4bit,$rem,$rem_4bit + addl $inp,$len,$len + ldo L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit + ldi 0xf0,$mask0xf0 +___ +$code.=<<___ if ($SIZE_T==4); + ldi 31,$rem + mtctl $rem,%cr11 + extrd,u,*= $rem,%sar,1,$rem ; executes on PA-RISC 1.0 + b L\$parisc1_ghash + nop +___ + +$code.=<<___; + ldb 15($Xi),$nlo + ldo 8($Htbl),$Hll + +L\$outer_ghash_pa2 + ldb 15($inp),$nhi + xor $nhi,$nlo,$nlo + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + + ldd $nlo($Hll),$Zll + ldd $nlo($Hhh),$Zhh + + depd,z $Zll,60,4,$rem + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldb 14($Xi),$nlo + ldb 14($inp),$byte + + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + xor $byte,$nlo,$nlo + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + b L\$oop_ghash_pa2 + ldi 13,$cnt + + .ALIGN 8 +L\$oop_ghash_pa2 + xor $rem,$Zhh,$Zhh ; moved here to work around gas bug + depd,z $Zll,60,4,$rem2 + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nlo($Hll),$Tll + ldd $nlo($Hhh),$Thh + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldbx $cnt($Xi),$nlo + ldbx $cnt($inp),$byte + + depd,z $Zll,60,4,$rem + shrpd $Zhh,$Zll,4,$Zll + ldd $rem2($rem_4bit),$rem2 + + xor $rem2,$Zhh,$Zhh + xor $byte,$nlo,$nlo + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + + and $mask0xf0,$nlo,$nhi + depd,z $nlo,59,4,$nlo + + extrd,u $Zhh,59,60,$Zhh + xor $Tll,$Zll,$Zll + + ldd $rem($rem_4bit),$rem + addib,uv -1,$cnt,L\$oop_ghash_pa2 + xor $Thh,$Zhh,$Zhh + + xor $rem,$Zhh,$Zhh + depd,z $Zll,60,4,$rem2 + + shrpd $Zhh,$Zll,4,$Zll + extrd,u $Zhh,59,60,$Zhh + ldd $nlo($Hll),$Tll + ldd $nlo($Hhh),$Thh + + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + + depd,z $Zll,60,4,$rem + shrpd $Zhh,$Zll,4,$Zll + ldd $rem2($rem_4bit),$rem2 + + xor $rem2,$Zhh,$Zhh + ldd $nhi($Hll),$Tll + ldd $nhi($Hhh),$Thh + + extrd,u $Zhh,59,60,$Zhh + xor $Tll,$Zll,$Zll + xor $Thh,$Zhh,$Zhh + ldd $rem($rem_4bit),$rem + + xor $rem,$Zhh,$Zhh + std $Zll,8($Xi) + ldo 16($inp),$inp + std $Zhh,0($Xi) + cmpb,*<> $inp,$len,L\$outer_ghash_pa2 + copy $Zll,$nlo +___ + +$code.=<<___ if ($SIZE_T==4); + b L\$done_ghash + nop + +L\$parisc1_ghash + ldb 15($Xi),$nlo + ldo 12($Htbl),$Hll + ldo 8($Htbl),$Hlh + ldo 4($Htbl),$Hhl + +L\$outer_ghash_pa1 + ldb 15($inp),$byte + xor $byte,$nlo,$nlo + and $mask0xf0,$nlo,$nhi + zdep $nlo,27,4,$nlo + + ldwx $nlo($Hll),$Zll + ldwx $nlo($Hlh),$Zlh + ldwx $nlo($Hhl),$Zhl + ldwx $nlo($Hhh),$Zhh + zdep $Zll,28,4,$rem + ldb 14($Xi),$nlo + ldb 14($inp),$byte + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhl,$Zlh,4,$Zlh + ldwx $nhi($Hlh),$Tlh + shrpw $Zhh,$Zhl,4,$Zhl + ldwx $nhi($Hhl),$Thl + extru $Zhh,27,28,$Zhh + ldwx $nhi($Hhh),$Thh + xor $byte,$nlo,$nlo + xor $rem,$Zhh,$Zhh + and $mask0xf0,$nlo,$nhi + zdep $nlo,27,4,$nlo + + xor $Tll,$Zll,$Zll + ldwx $nlo($Hll),$Tll + xor $Tlh,$Zlh,$Zlh + ldwx $nlo($Hlh),$Tlh + xor $Thl,$Zhl,$Zhl + b L\$oop_ghash_pa1 + ldi 13,$cnt + + .ALIGN 8 +L\$oop_ghash_pa1 + zdep $Zll,28,4,$rem + ldwx $nlo($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nlo($Hhh),$Thh + shrpw $Zhl,$Zlh,4,$Zlh + ldbx $cnt($Xi),$nlo + xor $Tll,$Zll,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhh,$Zhl,4,$Zhl + ldbx $cnt($inp),$byte + xor $Tlh,$Zlh,$Zlh + ldwx $nhi($Hlh),$Tlh + extru $Zhh,27,28,$Zhh + xor $Thl,$Zhl,$Zhl + ldwx $nhi($Hhl),$Thl + xor $rem,$Zhh,$Zhh + zdep $Zll,28,4,$rem + xor $Thh,$Zhh,$Zhh + ldwx $nhi($Hhh),$Thh + shrpw $Zlh,$Zll,4,$Zll + ldwx $rem($rem_4bit),$rem + shrpw $Zhl,$Zlh,4,$Zlh + xor $byte,$nlo,$nlo + shrpw $Zhh,$Zhl,4,$Zhl + and $mask0xf0,$nlo,$nhi + extru $Zhh,27,28,$Zhh + zdep $nlo,27,4,$nlo + xor $Tll,$Zll,$Zll + ldwx $nlo($Hll),$Tll + xor $Tlh,$Zlh,$Zlh + ldwx $nlo($Hlh),$Tlh + xor $rem,$Zhh,$Zhh + addib,uv -1,$cnt,L\$oop_ghash_pa1 + xor $Thl,$Zhl,$Zhl + + zdep $Zll,28,4,$rem + ldwx $nlo($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + ldwx $nlo($Hhh),$Thh + shrpw $Zhl,$Zlh,4,$Zlh + xor $Tll,$Zll,$Zll + ldwx $nhi($Hll),$Tll + shrpw $Zhh,$Zhl,4,$Zhl + xor $Tlh,$Zlh,$Zlh + ldwx $nhi($Hlh),$Tlh + extru $Zhh,27,28,$Zhh + xor $rem,$Zhh,$Zhh + xor $Thl,$Zhl,$Zhl + ldwx $nhi($Hhl),$Thl + xor $Thh,$Zhh,$Zhh + ldwx $nhi($Hhh),$Thh + zdep $Zll,28,4,$rem + ldwx $rem($rem_4bit),$rem + shrpw $Zlh,$Zll,4,$Zll + shrpw $Zhl,$Zlh,4,$Zlh + shrpw $Zhh,$Zhl,4,$Zhl + extru $Zhh,27,28,$Zhh + xor $Tll,$Zll,$Zll + xor $Tlh,$Zlh,$Zlh + xor $rem,$Zhh,$Zhh + stw $Zll,12($Xi) + xor $Thl,$Zhl,$Zhl + stw $Zlh,8($Xi) + xor $Thh,$Zhh,$Zhh + stw $Zhl,4($Xi) + ldo 16($inp),$inp + stw $Zhh,0($Xi) + comb,<> $inp,$len,L\$outer_ghash_pa1 + copy $Zll,$nlo +___ +$code.=<<___; +L\$done_ghash + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 +___ +$code.=<<___ if ($SIZE_T==4); + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 +___ +$code.=<<___; + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + + .ALIGN 64 +L\$rem_4bit + .WORD `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0 + .WORD `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0 + .WORD `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0 + .WORD `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0 + .STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by " + .ALIGN 64 +___ + +# Explicitly encode PA-RISC 2.0 instructions used in this module, so +# that it can be compiled with .LEVEL 1.0. It should be noted that I +# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 +# directive... + +my $ldd = sub { + my ($mod,$args) = @_; + my $orig = "ldd$mod\t$args"; + + if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 + { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 + { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; + $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset + $opcode|=(1<<5) if ($mod =~ /^,m/); + $opcode|=(1<<13) if ($mod =~ /^,mb/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $std = sub { + my ($mod,$args) = @_; + my $orig = "std$mod\t$args"; + + if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices + { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $extrd = sub { + my ($mod,$args) = @_; + my $orig = "extrd$mod\t$args"; + + # I only have ",u" completer, it's implicitly encoded... + if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 + { my $opcode=(0x36<<26)|($1<<21)|($4<<16); + my $len=32-$3; + $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos + $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 + { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); + my $len=32-$2; + $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len + $opcode |= (1<<13) if ($mod =~ /,\**=/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $shrpd = sub { + my ($mod,$args) = @_; + my $orig = "shrpd$mod\t$args"; + + if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 + { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; + my $cpos=63-$3; + $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11 + { sprintf "\t.WORD\t0x%08x\t; %s", + (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig; + } + else { "\t".$orig; } +}; + +my $depd = sub { + my ($mod,$args) = @_; + my $orig = "depd$mod\t$args"; + + # I only have ",z" completer, it's impicitly encoded... + if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 16 + { my $opcode=(0x3c<<26)|($4<<21)|($1<<16); + my $cpos=63-$2; + my $len=32-$3; + $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode pos + $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +sub assemble { + my ($mnemonic,$mod,$args)=@_; + my $opcode = eval("\$$mnemonic"); + + ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + if ($SIZE_T==4) { + s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e; + s/cmpb,\*/comb,/; + s/,\*/,/; + } + print $_,"\n"; +} + +close STDOUT; diff --git a/src/lib/libcrypto/modes/asm/ghash-s390x.pl b/src/lib/libcrypto/modes/asm/ghash-s390x.pl new file mode 100644 index 0000000000..6a40d5d89c --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-s390x.pl @@ -0,0 +1,262 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# September 2010. +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. Performance +# was measured to be ~18 cycles per processed byte on z10, which is +# almost 40% better than gcc-generated code. It should be noted that +# 18 cycles is worse result than expected: loop is scheduled for 12 +# and the result should be close to 12. In the lack of instruction- +# level profiling data it's impossible to tell why... + +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z990 it was measured to perform +# 2.8x better than 32-bit code generated by gcc 4.3. + +# March 2011. +# +# Support for hardware KIMD-GHASH is verified to produce correct +# result and therefore is engaged. On z196 it was measured to process +# 8KB buffer ~7 faster than software implementation. It's not as +# impressive for smaller buffer sizes and for smallest 16-bytes buffer +# it's actually almost 2 times slower. Which is the reason why +# KIMD-GHASH is not used in gcm_gmult_4bit. + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + +$softonly=0; + +$Zhi="%r0"; +$Zlo="%r1"; + +$Xi="%r2"; # argument block +$Htbl="%r3"; +$inp="%r4"; +$len="%r5"; + +$rem0="%r6"; # variables +$rem1="%r7"; +$nlo="%r8"; +$nhi="%r9"; +$xi="%r10"; +$cnt="%r11"; +$tmp="%r12"; +$x78="%r13"; +$rem_4bit="%r14"; + +$sp="%r15"; + +$code.=<<___; +.text + +.globl gcm_gmult_4bit +.align 32 +gcm_gmult_4bit: +___ +$code.=<<___ if(!$softonly && 0); # hardware is slow for single block... + larl %r1,OPENSSL_s390xcap_P + lg %r0,0(%r1) + tmhl %r0,0x4000 # check for message-security-assist + jz .Lsoft_gmult + lghi %r0,0 + la %r1,16($sp) + .long 0xb93e0004 # kimd %r0,%r4 + lg %r1,24($sp) + tmhh %r1,0x4000 # check for function 65 + jz .Lsoft_gmult + stg %r0,16($sp) # arrange 16 bytes of zero input + stg %r0,24($sp) + lghi %r0,65 # function 65 + la %r1,0($Xi) # H lies right after Xi in gcm128_context + la $inp,16($sp) + lghi $len,16 + .long 0xb93e0004 # kimd %r0,$inp + brc 1,.-4 # pay attention to "partial completion" + br %r14 +.align 32 +.Lsoft_gmult: +___ +$code.=<<___; + stm${g} %r6,%r14,6*$SIZE_T($sp) + + aghi $Xi,-1 + lghi $len,1 + lghi $x78,`0xf<<3` + larl $rem_4bit,rem_4bit + + lg $Zlo,8+1($Xi) # Xi + j .Lgmult_shortcut +.type gcm_gmult_4bit,\@function +.size gcm_gmult_4bit,(.-gcm_gmult_4bit) + +.globl gcm_ghash_4bit +.align 32 +gcm_ghash_4bit: +___ +$code.=<<___ if(!$softonly); + larl %r1,OPENSSL_s390xcap_P + lg %r0,0(%r1) + tmhl %r0,0x4000 # check for message-security-assist + jz .Lsoft_ghash + lghi %r0,0 + la %r1,16($sp) + .long 0xb93e0004 # kimd %r0,%r4 + lg %r1,24($sp) + tmhh %r1,0x4000 # check for function 65 + jz .Lsoft_ghash + lghi %r0,65 # function 65 + la %r1,0($Xi) # H lies right after Xi in gcm128_context + .long 0xb93e0004 # kimd %r0,$inp + brc 1,.-4 # pay attention to "partial completion" + br %r14 +.align 32 +.Lsoft_ghash: +___ +$code.=<<___ if ($flavour =~ /3[12]/); + llgfr $len,$len +___ +$code.=<<___; + stm${g} %r6,%r14,6*$SIZE_T($sp) + + aghi $Xi,-1 + srlg $len,$len,4 + lghi $x78,`0xf<<3` + larl $rem_4bit,rem_4bit + + lg $Zlo,8+1($Xi) # Xi + lg $Zhi,0+1($Xi) + lghi $tmp,0 +.Louter: + xg $Zhi,0($inp) # Xi ^= inp + xg $Zlo,8($inp) + xgr $Zhi,$tmp + stg $Zlo,8+1($Xi) + stg $Zhi,0+1($Xi) + +.Lgmult_shortcut: + lghi $tmp,0xf0 + sllg $nlo,$Zlo,4 + srlg $xi,$Zlo,8 # extract second byte + ngr $nlo,$tmp + lgr $nhi,$Zlo + lghi $cnt,14 + ngr $nhi,$tmp + + lg $Zlo,8($nlo,$Htbl) + lg $Zhi,0($nlo,$Htbl) + + sllg $nlo,$xi,4 + sllg $rem0,$Zlo,3 + ngr $nlo,$tmp + ngr $rem0,$x78 + ngr $xi,$tmp + + sllg $tmp,$Zhi,60 + srlg $Zlo,$Zlo,4 + srlg $Zhi,$Zhi,4 + xg $Zlo,8($nhi,$Htbl) + xg $Zhi,0($nhi,$Htbl) + lgr $nhi,$xi + sllg $rem1,$Zlo,3 + xgr $Zlo,$tmp + ngr $rem1,$x78 + j .Lghash_inner +.align 16 +.Lghash_inner: + srlg $Zlo,$Zlo,4 + sllg $tmp,$Zhi,60 + xg $Zlo,8($nlo,$Htbl) + srlg $Zhi,$Zhi,4 + llgc $xi,0($cnt,$Xi) + xg $Zhi,0($nlo,$Htbl) + sllg $nlo,$xi,4 + xg $Zhi,0($rem0,$rem_4bit) + nill $nlo,0xf0 + sllg $rem0,$Zlo,3 + xgr $Zlo,$tmp + ngr $rem0,$x78 + nill $xi,0xf0 + + sllg $tmp,$Zhi,60 + srlg $Zlo,$Zlo,4 + srlg $Zhi,$Zhi,4 + xg $Zlo,8($nhi,$Htbl) + xg $Zhi,0($nhi,$Htbl) + lgr $nhi,$xi + xg $Zhi,0($rem1,$rem_4bit) + sllg $rem1,$Zlo,3 + xgr $Zlo,$tmp + ngr $rem1,$x78 + brct $cnt,.Lghash_inner + + sllg $tmp,$Zhi,60 + srlg $Zlo,$Zlo,4 + srlg $Zhi,$Zhi,4 + xg $Zlo,8($nlo,$Htbl) + xg $Zhi,0($nlo,$Htbl) + sllg $xi,$Zlo,3 + xg $Zhi,0($rem0,$rem_4bit) + xgr $Zlo,$tmp + ngr $xi,$x78 + + sllg $tmp,$Zhi,60 + srlg $Zlo,$Zlo,4 + srlg $Zhi,$Zhi,4 + xg $Zlo,8($nhi,$Htbl) + xg $Zhi,0($nhi,$Htbl) + xgr $Zlo,$tmp + xg $Zhi,0($rem1,$rem_4bit) + + lg $tmp,0($xi,$rem_4bit) + la $inp,16($inp) + sllg $tmp,$tmp,4 # correct last rem_4bit[rem] + brctg $len,.Louter + + xgr $Zhi,$tmp + stg $Zlo,8+1($Xi) + stg $Zhi,0+1($Xi) + lm${g} %r6,%r14,6*$SIZE_T($sp) + br %r14 +.type gcm_ghash_4bit,\@function +.size gcm_ghash_4bit,(.-gcm_ghash_4bit) + +.align 64 +rem_4bit: + .long `0x0000<<12`,0,`0x1C20<<12`,0,`0x3840<<12`,0,`0x2460<<12`,0 + .long `0x7080<<12`,0,`0x6CA0<<12`,0,`0x48C0<<12`,0,`0x54E0<<12`,0 + .long `0xE100<<12`,0,`0xFD20<<12`,0,`0xD940<<12`,0,`0xC560<<12`,0 + .long `0x9180<<12`,0,`0x8DA0<<12`,0,`0xA9C0<<12`,0,`0xB5E0<<12`,0 +.type rem_4bit,\@object +.size rem_4bit,(.-rem_4bit) +.string "GHASH for s390x, CRYPTOGAMS by " +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl new file mode 100644 index 0000000000..70e7b044a3 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-sparcv9.pl @@ -0,0 +1,330 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# March 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+128 bytes shared table]. Performance +# results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU +# and are expressed in cycles per processed byte, less is better: +# +# gcc 3.3.x cc 5.2 this assembler +# +# 32-bit build 81.4 43.3 12.6 (+546%/+244%) +# 64-bit build 20.2 21.2 12.6 (+60%/+68%) +# +# Here is data collected on UltraSPARC T1 system running Linux: +# +# gcc 4.4.1 this assembler +# +# 32-bit build 566 50 (+1000%) +# 64-bit build 56 50 (+12%) +# +# I don't quite understand why difference between 32-bit and 64-bit +# compiler-generated code is so big. Compilers *were* instructed to +# generate code for UltraSPARC and should have used 64-bit registers +# for Z vector (see C code) even in 32-bit build... Oh well, it only +# means more impressive improvement coefficients for this assembler +# module;-) Loops are aggressively modulo-scheduled in respect to +# references to input data and Z.hi updates to achieve 12 cycles +# timing. To anchor to something else, sha1-sparcv9.pl spends 11.6 +# cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1. + +$bits=32; +for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); } +if ($bits==64) { $bias=2047; $frame=192; } +else { $bias=0; $frame=112; } + +$output=shift; +open STDOUT,">$output"; + +$Zhi="%o0"; # 64-bit values +$Zlo="%o1"; +$Thi="%o2"; +$Tlo="%o3"; +$rem="%o4"; +$tmp="%o5"; + +$nhi="%l0"; # small values and pointers +$nlo="%l1"; +$xi0="%l2"; +$xi1="%l3"; +$rem_4bit="%l4"; +$remi="%l5"; +$Htblo="%l6"; +$cnt="%l7"; + +$Xi="%i0"; # input argument block +$Htbl="%i1"; +$inp="%i2"; +$len="%i3"; + +$code.=<<___; +.section ".text",#alloc,#execinstr + +.align 64 +rem_4bit: + .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0 + .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0 + .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0 + .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0 +.type rem_4bit,#object +.size rem_4bit,(.-rem_4bit) + +.globl gcm_ghash_4bit +.align 32 +gcm_ghash_4bit: + save %sp,-$frame,%sp + ldub [$inp+15],$nlo + ldub [$Xi+15],$xi0 + ldub [$Xi+14],$xi1 + add $len,$inp,$len + add $Htbl,8,$Htblo + +1: call .+8 + add %o7,rem_4bit-1b,$rem_4bit + +.Louter: + xor $xi0,$nlo,$nlo + and $nlo,0xf0,$nhi + and $nlo,0x0f,$nlo + sll $nlo,4,$nlo + ldx [$Htblo+$nlo],$Zlo + ldx [$Htbl+$nlo],$Zhi + + ldub [$inp+14],$nlo + + ldx [$Htblo+$nhi],$Tlo + and $Zlo,0xf,$remi + ldx [$Htbl+$nhi],$Thi + sll $remi,3,$remi + ldx [$rem_4bit+$remi],$rem + srlx $Zlo,4,$Zlo + mov 13,$cnt + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + + xor $xi1,$nlo,$nlo + and $Zlo,0xf,$remi + and $nlo,0xf0,$nhi + and $nlo,0x0f,$nlo + ba .Lghash_inner + sll $nlo,4,$nlo +.align 32 +.Lghash_inner: + ldx [$Htblo+$nlo],$Tlo + sll $remi,3,$remi + xor $Thi,$Zhi,$Zhi + ldx [$Htbl+$nlo],$Thi + srlx $Zlo,4,$Zlo + xor $rem,$Zhi,$Zhi + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + ldub [$inp+$cnt],$nlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + ldub [$Xi+$cnt],$xi1 + xor $Thi,$Zhi,$Zhi + and $Zlo,0xf,$remi + + ldx [$Htblo+$nhi],$Tlo + sll $remi,3,$remi + xor $rem,$Zhi,$Zhi + ldx [$Htbl+$nhi],$Thi + srlx $Zlo,4,$Zlo + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $xi1,$nlo,$nlo + srlx $Zhi,4,$Zhi + and $nlo,0xf0,$nhi + addcc $cnt,-1,$cnt + xor $Zlo,$tmp,$Zlo + and $nlo,0x0f,$nlo + xor $Tlo,$Zlo,$Zlo + sll $nlo,4,$nlo + blu .Lghash_inner + and $Zlo,0xf,$remi + + ldx [$Htblo+$nlo],$Tlo + sll $remi,3,$remi + xor $Thi,$Zhi,$Zhi + ldx [$Htbl+$nlo],$Thi + srlx $Zlo,4,$Zlo + xor $rem,$Zhi,$Zhi + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + + add $inp,16,$inp + cmp $inp,$len + be,pn `$bits==64?"%xcc":"%icc"`,.Ldone + and $Zlo,0xf,$remi + + ldx [$Htblo+$nhi],$Tlo + sll $remi,3,$remi + xor $rem,$Zhi,$Zhi + ldx [$Htbl+$nhi],$Thi + srlx $Zlo,4,$Zlo + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + ldub [$inp+15],$nlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + stx $Zlo,[$Xi+8] + xor $rem,$Zhi,$Zhi + stx $Zhi,[$Xi] + srl $Zlo,8,$xi1 + and $Zlo,0xff,$xi0 + ba .Louter + and $xi1,0xff,$xi1 +.align 32 +.Ldone: + ldx [$Htblo+$nhi],$Tlo + sll $remi,3,$remi + xor $rem,$Zhi,$Zhi + ldx [$Htbl+$nhi],$Thi + srlx $Zlo,4,$Zlo + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + stx $Zlo,[$Xi+8] + xor $rem,$Zhi,$Zhi + stx $Zhi,[$Xi] + + ret + restore +.type gcm_ghash_4bit,#function +.size gcm_ghash_4bit,(.-gcm_ghash_4bit) +___ + +undef $inp; +undef $len; + +$code.=<<___; +.globl gcm_gmult_4bit +.align 32 +gcm_gmult_4bit: + save %sp,-$frame,%sp + ldub [$Xi+15],$nlo + add $Htbl,8,$Htblo + +1: call .+8 + add %o7,rem_4bit-1b,$rem_4bit + + and $nlo,0xf0,$nhi + and $nlo,0x0f,$nlo + sll $nlo,4,$nlo + ldx [$Htblo+$nlo],$Zlo + ldx [$Htbl+$nlo],$Zhi + + ldub [$Xi+14],$nlo + + ldx [$Htblo+$nhi],$Tlo + and $Zlo,0xf,$remi + ldx [$Htbl+$nhi],$Thi + sll $remi,3,$remi + ldx [$rem_4bit+$remi],$rem + srlx $Zlo,4,$Zlo + mov 13,$cnt + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + + and $Zlo,0xf,$remi + and $nlo,0xf0,$nhi + and $nlo,0x0f,$nlo + ba .Lgmult_inner + sll $nlo,4,$nlo +.align 32 +.Lgmult_inner: + ldx [$Htblo+$nlo],$Tlo + sll $remi,3,$remi + xor $Thi,$Zhi,$Zhi + ldx [$Htbl+$nlo],$Thi + srlx $Zlo,4,$Zlo + xor $rem,$Zhi,$Zhi + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + ldub [$Xi+$cnt],$nlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + and $Zlo,0xf,$remi + + ldx [$Htblo+$nhi],$Tlo + sll $remi,3,$remi + xor $rem,$Zhi,$Zhi + ldx [$Htbl+$nhi],$Thi + srlx $Zlo,4,$Zlo + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + srlx $Zhi,4,$Zhi + and $nlo,0xf0,$nhi + addcc $cnt,-1,$cnt + xor $Zlo,$tmp,$Zlo + and $nlo,0x0f,$nlo + xor $Tlo,$Zlo,$Zlo + sll $nlo,4,$nlo + blu .Lgmult_inner + and $Zlo,0xf,$remi + + ldx [$Htblo+$nlo],$Tlo + sll $remi,3,$remi + xor $Thi,$Zhi,$Zhi + ldx [$Htbl+$nlo],$Thi + srlx $Zlo,4,$Zlo + xor $rem,$Zhi,$Zhi + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + and $Zlo,0xf,$remi + + ldx [$Htblo+$nhi],$Tlo + sll $remi,3,$remi + xor $rem,$Zhi,$Zhi + ldx [$Htbl+$nhi],$Thi + srlx $Zlo,4,$Zlo + ldx [$rem_4bit+$remi],$rem + sllx $Zhi,60,$tmp + xor $Tlo,$Zlo,$Zlo + srlx $Zhi,4,$Zhi + xor $Zlo,$tmp,$Zlo + xor $Thi,$Zhi,$Zhi + stx $Zlo,[$Xi+8] + xor $rem,$Zhi,$Zhi + stx $Zhi,[$Xi] + + ret + restore +.type gcm_gmult_4bit,#function +.size gcm_gmult_4bit,(.-gcm_gmult_4bit) +.asciz "GHASH for SPARCv9, CRYPTOGAMS by " +.align 4 +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/modes/asm/ghash-x86.pl b/src/lib/libcrypto/modes/asm/ghash-x86.pl new file mode 100644 index 0000000000..6b09669d47 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-x86.pl @@ -0,0 +1,1342 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March, May, June 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that it +# uses 256 bytes per-key table [+64/128 bytes fixed table]. It has two +# code paths: vanilla x86 and vanilla MMX. Former will be executed on +# 486 and Pentium, latter on all others. MMX GHASH features so called +# "528B" variant of "4-bit" method utilizing additional 256+16 bytes +# of per-key storage [+512 bytes shared table]. Performance results +# are for streamed GHASH subroutine and are expressed in cycles per +# processed byte, less is better: +# +# gcc 2.95.3(*) MMX assembler x86 assembler +# +# Pentium 105/111(**) - 50 +# PIII 68 /75 12.2 24 +# P4 125/125 17.8 84(***) +# Opteron 66 /70 10.1 30 +# Core2 54 /67 8.4 18 +# +# (*) gcc 3.4.x was observed to generate few percent slower code, +# which is one of reasons why 2.95.3 results were chosen, +# another reason is lack of 3.4.x results for older CPUs; +# comparison with MMX results is not completely fair, because C +# results are for vanilla "256B" implementation, while +# assembler results are for "528B";-) +# (**) second number is result for code compiled with -fPIC flag, +# which is actually more relevant, because assembler code is +# position-independent; +# (***) see comment in non-MMX routine for further details; +# +# To summarize, it's >2-5 times faster than gcc-generated code. To +# anchor it to something else SHA1 assembler processes one byte in +# 11-13 cycles on contemporary x86 cores. As for choice of MMX in +# particular, see comment at the end of the file... + +# May 2010 +# +# Add PCLMULQDQ version performing at 2.10 cycles per processed byte. +# The question is how close is it to theoretical limit? The pclmulqdq +# instruction latency appears to be 14 cycles and there can't be more +# than 2 of them executing at any given time. This means that single +# Karatsuba multiplication would take 28 cycles *plus* few cycles for +# pre- and post-processing. Then multiplication has to be followed by +# modulo-reduction. Given that aggregated reduction method [see +# "Carry-less Multiplication and Its Usage for Computing the GCM Mode" +# white paper by Intel] allows you to perform reduction only once in +# a while we can assume that asymptotic performance can be estimated +# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction +# and Naggr is the aggregation factor. +# +# Before we proceed to this implementation let's have closer look at +# the best-performing code suggested by Intel in their white paper. +# By tracing inter-register dependencies Tmod is estimated as ~19 +# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per +# processed byte. As implied, this is quite optimistic estimate, +# because it does not account for Karatsuba pre- and post-processing, +# which for a single multiplication is ~5 cycles. Unfortunately Intel +# does not provide performance data for GHASH alone. But benchmarking +# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt +# alone resulted in 2.46 cycles per byte of out 16KB buffer. Note that +# the result accounts even for pre-computing of degrees of the hash +# key H, but its portion is negligible at 16KB buffer size. +# +# Moving on to the implementation in question. Tmod is estimated as +# ~13 cycles and Naggr is 2, giving asymptotic performance of ... +# 2.16. How is it possible that measured performance is better than +# optimistic theoretical estimate? There is one thing Intel failed +# to recognize. By serializing GHASH with CTR in same subroutine +# former's performance is really limited to above (Tmul + Tmod/Naggr) +# equation. But if GHASH procedure is detached, the modulo-reduction +# can be interleaved with Naggr-1 multiplications at instruction level +# and under ideal conditions even disappear from the equation. So that +# optimistic theoretical estimate for this implementation is ... +# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic, +# at least for such small Naggr. I'd argue that (28+Tproc/Naggr), +# where Tproc is time required for Karatsuba pre- and post-processing, +# is more realistic estimate. In this case it gives ... 1.91 cycles. +# Or in other words, depending on how well we can interleave reduction +# and one of the two multiplications the performance should be betwen +# 1.91 and 2.16. As already mentioned, this implementation processes +# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart +# - in 2.02. x86_64 performance is better, because larger register +# bank allows to interleave reduction and multiplication better. +# +# Does it make sense to increase Naggr? To start with it's virtually +# impossible in 32-bit mode, because of limited register bank +# capacity. Otherwise improvement has to be weighed agiainst slower +# setup, as well as code size and complexity increase. As even +# optimistic estimate doesn't promise 30% performance improvement, +# there are currently no plans to increase Naggr. +# +# Special thanks to David Woodhouse for +# providing access to a Westmere-based system on behalf of Intel +# Open Source Technology Centre. + +# January 2010 +# +# Tweaked to optimize transitions between integer and FP operations +# on same XMM register, PCLMULQDQ subroutine was measured to process +# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere. +# The minor regression on Westmere is outweighed by ~15% improvement +# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in +# similar manner resulted in almost 20% degradation on Sandy Bridge, +# where original 64-bit code processes one byte in 1.95 cycles. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); +require "x86asm.pl"; + +&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386"); + +$sse2=0; +for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } + +($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx"); +$inp = "edi"; +$Htbl = "esi"; + +$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse + # than unrolled, which has to be weighted against + # 2.5x x86-specific code size reduction. + +sub x86_loop { + my $off = shift; + my $rem = "eax"; + + &mov ($Zhh,&DWP(4,$Htbl,$Zll)); + &mov ($Zhl,&DWP(0,$Htbl,$Zll)); + &mov ($Zlh,&DWP(12,$Htbl,$Zll)); + &mov ($Zll,&DWP(8,$Htbl,$Zll)); + &xor ($rem,$rem); # avoid partial register stalls on PIII + + # shrd practically kills P4, 2.5x deterioration, but P4 has + # MMX code-path to execute. shrd runs tad faster [than twice + # the shifts, move's and or's] on pre-MMX Pentium (as well as + # PIII and Core2), *but* minimizes code size, spares register + # and thus allows to fold the loop... + if (!$unroll) { + my $cnt = $inp; + &mov ($cnt,15); + &jmp (&label("x86_loop")); + &set_label("x86_loop",16); + for($i=1;$i<=2;$i++) { + &mov (&LB($rem),&LB($Zll)); + &shrd ($Zll,$Zlh,4); + &and (&LB($rem),0xf); + &shrd ($Zlh,$Zhl,4); + &shrd ($Zhl,$Zhh,4); + &shr ($Zhh,4); + &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); + + &mov (&LB($rem),&BP($off,"esp",$cnt)); + if ($i&1) { + &and (&LB($rem),0xf0); + } else { + &shl (&LB($rem),4); + } + + &xor ($Zll,&DWP(8,$Htbl,$rem)); + &xor ($Zlh,&DWP(12,$Htbl,$rem)); + &xor ($Zhl,&DWP(0,$Htbl,$rem)); + &xor ($Zhh,&DWP(4,$Htbl,$rem)); + + if ($i&1) { + &dec ($cnt); + &js (&label("x86_break")); + } else { + &jmp (&label("x86_loop")); + } + } + &set_label("x86_break",16); + } else { + for($i=1;$i<32;$i++) { + &comment($i); + &mov (&LB($rem),&LB($Zll)); + &shrd ($Zll,$Zlh,4); + &and (&LB($rem),0xf); + &shrd ($Zlh,$Zhl,4); + &shrd ($Zhl,$Zhh,4); + &shr ($Zhh,4); + &xor ($Zhh,&DWP($off+16,"esp",$rem,4)); + + if ($i&1) { + &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); + &and (&LB($rem),0xf0); + } else { + &mov (&LB($rem),&BP($off+15-($i>>1),"esp")); + &shl (&LB($rem),4); + } + + &xor ($Zll,&DWP(8,$Htbl,$rem)); + &xor ($Zlh,&DWP(12,$Htbl,$rem)); + &xor ($Zhl,&DWP(0,$Htbl,$rem)); + &xor ($Zhh,&DWP(4,$Htbl,$rem)); + } + } + &bswap ($Zll); + &bswap ($Zlh); + &bswap ($Zhl); + if (!$x86only) { + &bswap ($Zhh); + } else { + &mov ("eax",$Zhh); + &bswap ("eax"); + &mov ($Zhh,"eax"); + } +} + +if ($unroll) { + &function_begin_B("_x86_gmult_4bit_inner"); + &x86_loop(4); + &ret (); + &function_end_B("_x86_gmult_4bit_inner"); +} + +sub deposit_rem_4bit { + my $bias = shift; + + &mov (&DWP($bias+0, "esp"),0x0000<<16); + &mov (&DWP($bias+4, "esp"),0x1C20<<16); + &mov (&DWP($bias+8, "esp"),0x3840<<16); + &mov (&DWP($bias+12,"esp"),0x2460<<16); + &mov (&DWP($bias+16,"esp"),0x7080<<16); + &mov (&DWP($bias+20,"esp"),0x6CA0<<16); + &mov (&DWP($bias+24,"esp"),0x48C0<<16); + &mov (&DWP($bias+28,"esp"),0x54E0<<16); + &mov (&DWP($bias+32,"esp"),0xE100<<16); + &mov (&DWP($bias+36,"esp"),0xFD20<<16); + &mov (&DWP($bias+40,"esp"),0xD940<<16); + &mov (&DWP($bias+44,"esp"),0xC560<<16); + &mov (&DWP($bias+48,"esp"),0x9180<<16); + &mov (&DWP($bias+52,"esp"),0x8DA0<<16); + &mov (&DWP($bias+56,"esp"),0xA9C0<<16); + &mov (&DWP($bias+60,"esp"),0xB5E0<<16); +} + +$suffix = $x86only ? "" : "_x86"; + +&function_begin("gcm_gmult_4bit".$suffix); + &stack_push(16+4+1); # +1 for stack alignment + &mov ($inp,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + + &mov ($Zhh,&DWP(0,$inp)); # load Xi[16] + &mov ($Zhl,&DWP(4,$inp)); + &mov ($Zlh,&DWP(8,$inp)); + &mov ($Zll,&DWP(12,$inp)); + + &deposit_rem_4bit(16); + + &mov (&DWP(0,"esp"),$Zhh); # copy Xi[16] on stack + &mov (&DWP(4,"esp"),$Zhl); + &mov (&DWP(8,"esp"),$Zlh); + &mov (&DWP(12,"esp"),$Zll); + &shr ($Zll,20); + &and ($Zll,0xf0); + + if ($unroll) { + &call ("_x86_gmult_4bit_inner"); + } else { + &x86_loop(0); + &mov ($inp,&wparam(0)); + } + + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(0,$inp),$Zhh); + &stack_pop(16+4+1); +&function_end("gcm_gmult_4bit".$suffix); + +&function_begin("gcm_ghash_4bit".$suffix); + &stack_push(16+4+1); # +1 for 64-bit alignment + &mov ($Zll,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + &mov ($inp,&wparam(2)); # load in + &mov ("ecx",&wparam(3)); # load len + &add ("ecx",$inp); + &mov (&wparam(3),"ecx"); + + &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16] + &mov ($Zhl,&DWP(4,$Zll)); + &mov ($Zlh,&DWP(8,$Zll)); + &mov ($Zll,&DWP(12,$Zll)); + + &deposit_rem_4bit(16); + + &set_label("x86_outer_loop",16); + &xor ($Zll,&DWP(12,$inp)); # xor with input + &xor ($Zlh,&DWP(8,$inp)); + &xor ($Zhl,&DWP(4,$inp)); + &xor ($Zhh,&DWP(0,$inp)); + &mov (&DWP(12,"esp"),$Zll); # dump it on stack + &mov (&DWP(8,"esp"),$Zlh); + &mov (&DWP(4,"esp"),$Zhl); + &mov (&DWP(0,"esp"),$Zhh); + + &shr ($Zll,20); + &and ($Zll,0xf0); + + if ($unroll) { + &call ("_x86_gmult_4bit_inner"); + } else { + &x86_loop(0); + &mov ($inp,&wparam(2)); + } + &lea ($inp,&DWP(16,$inp)); + &cmp ($inp,&wparam(3)); + &mov (&wparam(2),$inp) if (!$unroll); + &jb (&label("x86_outer_loop")); + + &mov ($inp,&wparam(0)); # load Xi + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(0,$inp),$Zhh); + &stack_pop(16+4+1); +&function_end("gcm_ghash_4bit".$suffix); + +if (!$x86only) {{{ + +&static_label("rem_4bit"); + +if (!$sse2) {{ # pure-MMX "May" version... + +$S=12; # shift factor for rem_4bit + +&function_begin_B("_mmx_gmult_4bit_inner"); +# MMX version performs 3.5 times better on P4 (see comment in non-MMX +# routine for further details), 100% better on Opteron, ~70% better +# on Core2 and PIII... In other words effort is considered to be well +# spent... Since initial release the loop was unrolled in order to +# "liberate" register previously used as loop counter. Instead it's +# used to optimize critical path in 'Z.hi ^= rem_4bit[Z.lo&0xf]'. +# The path involves move of Z.lo from MMX to integer register, +# effective address calculation and finally merge of value to Z.hi. +# Reference to rem_4bit is scheduled so late that I had to >>4 +# rem_4bit elements. This resulted in 20-45% procent improvement +# on contemporary µ-archs. +{ + my $cnt; + my $rem_4bit = "eax"; + my @rem = ($Zhh,$Zll); + my $nhi = $Zhl; + my $nlo = $Zlh; + + my ($Zlo,$Zhi) = ("mm0","mm1"); + my $tmp = "mm2"; + + &xor ($nlo,$nlo); # avoid partial register stalls on PIII + &mov ($nhi,$Zll); + &mov (&LB($nlo),&LB($nhi)); + &shl (&LB($nlo),4); + &and ($nhi,0xf0); + &movq ($Zlo,&QWP(8,$Htbl,$nlo)); + &movq ($Zhi,&QWP(0,$Htbl,$nlo)); + &movd ($rem[0],$Zlo); + + for ($cnt=28;$cnt>=-2;$cnt--) { + my $odd = $cnt&1; + my $nix = $odd ? $nlo : $nhi; + + &shl (&LB($nlo),4) if ($odd); + &psrlq ($Zlo,4); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nix)); + &mov (&LB($nlo),&BP($cnt/2,$inp)) if (!$odd && $cnt>=0); + &psllq ($tmp,60); + &and ($nhi,0xf0) if ($odd); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem[1],8)) if ($cnt<28); + &and ($rem[0],0xf); + &pxor ($Zhi,&QWP(0,$Htbl,$nix)); + &mov ($nhi,$nlo) if (!$odd && $cnt>=0); + &movd ($rem[1],$Zlo); + &pxor ($Zlo,$tmp); + + push (@rem,shift(@rem)); # "rotate" registers + } + + &mov ($inp,&DWP(4,$rem_4bit,$rem[1],8)); # last rem_4bit[rem] + + &psrlq ($Zlo,32); # lower part of Zlo is already there + &movd ($Zhl,$Zhi); + &psrlq ($Zhi,32); + &movd ($Zlh,$Zlo); + &movd ($Zhh,$Zhi); + &shl ($inp,4); # compensate for rem_4bit[i] being >>4 + + &bswap ($Zll); + &bswap ($Zhl); + &bswap ($Zlh); + &xor ($Zhh,$inp); + &bswap ($Zhh); + + &ret (); +} +&function_end_B("_mmx_gmult_4bit_inner"); + +&function_begin("gcm_gmult_4bit_mmx"); + &mov ($inp,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("eax"); + &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); + + &movz ($Zll,&BP(15,$inp)); + + &call ("_mmx_gmult_4bit_inner"); + + &mov ($inp,&wparam(0)); # load Xi + &emms (); + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(0,$inp),$Zhh); +&function_end("gcm_gmult_4bit_mmx"); + +# Streamed version performs 20% better on P4, 7% on Opteron, +# 10% on Core2 and PIII... +&function_begin("gcm_ghash_4bit_mmx"); + &mov ($Zhh,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + &mov ($inp,&wparam(2)); # load in + &mov ($Zlh,&wparam(3)); # load len + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("eax"); + &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); + + &add ($Zlh,$inp); + &mov (&wparam(3),$Zlh); # len to point at the end of input + &stack_push(4+1); # +1 for stack alignment + + &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16] + &mov ($Zhl,&DWP(4,$Zhh)); + &mov ($Zlh,&DWP(8,$Zhh)); + &mov ($Zhh,&DWP(0,$Zhh)); + &jmp (&label("mmx_outer_loop")); + + &set_label("mmx_outer_loop",16); + &xor ($Zll,&DWP(12,$inp)); + &xor ($Zhl,&DWP(4,$inp)); + &xor ($Zlh,&DWP(8,$inp)); + &xor ($Zhh,&DWP(0,$inp)); + &mov (&wparam(2),$inp); + &mov (&DWP(12,"esp"),$Zll); + &mov (&DWP(4,"esp"),$Zhl); + &mov (&DWP(8,"esp"),$Zlh); + &mov (&DWP(0,"esp"),$Zhh); + + &mov ($inp,"esp"); + &shr ($Zll,24); + + &call ("_mmx_gmult_4bit_inner"); + + &mov ($inp,&wparam(2)); + &lea ($inp,&DWP(16,$inp)); + &cmp ($inp,&wparam(3)); + &jb (&label("mmx_outer_loop")); + + &mov ($inp,&wparam(0)); # load Xi + &emms (); + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(0,$inp),$Zhh); + + &stack_pop(4+1); +&function_end("gcm_ghash_4bit_mmx"); + +}} else {{ # "June" MMX version... + # ... has slower "April" gcm_gmult_4bit_mmx with folded + # loop. This is done to conserve code size... +$S=16; # shift factor for rem_4bit + +sub mmx_loop() { +# MMX version performs 2.8 times better on P4 (see comment in non-MMX +# routine for further details), 40% better on Opteron and Core2, 50% +# better on PIII... In other words effort is considered to be well +# spent... + my $inp = shift; + my $rem_4bit = shift; + my $cnt = $Zhh; + my $nhi = $Zhl; + my $nlo = $Zlh; + my $rem = $Zll; + + my ($Zlo,$Zhi) = ("mm0","mm1"); + my $tmp = "mm2"; + + &xor ($nlo,$nlo); # avoid partial register stalls on PIII + &mov ($nhi,$Zll); + &mov (&LB($nlo),&LB($nhi)); + &mov ($cnt,14); + &shl (&LB($nlo),4); + &and ($nhi,0xf0); + &movq ($Zlo,&QWP(8,$Htbl,$nlo)); + &movq ($Zhi,&QWP(0,$Htbl,$nlo)); + &movd ($rem,$Zlo); + &jmp (&label("mmx_loop")); + + &set_label("mmx_loop",16); + &psrlq ($Zlo,4); + &and ($rem,0xf); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); + &mov (&LB($nlo),&BP(0,$inp,$cnt)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &dec ($cnt); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); + &mov ($nhi,$nlo); + &pxor ($Zlo,$tmp); + &js (&label("mmx_break")); + + &shl (&LB($nlo),4); + &and ($rem,0xf); + &psrlq ($Zlo,4); + &and ($nhi,0xf0); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); + &pxor ($Zlo,$tmp); + &jmp (&label("mmx_loop")); + + &set_label("mmx_break",16); + &shl (&LB($nlo),4); + &and ($rem,0xf); + &psrlq ($Zlo,4); + &and ($nhi,0xf0); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nlo)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nlo)); + &pxor ($Zlo,$tmp); + + &psrlq ($Zlo,4); + &and ($rem,0xf); + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &pxor ($Zlo,&QWP(8,$Htbl,$nhi)); + &psllq ($tmp,60); + &pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8)); + &movd ($rem,$Zlo); + &pxor ($Zhi,&QWP(0,$Htbl,$nhi)); + &pxor ($Zlo,$tmp); + + &psrlq ($Zlo,32); # lower part of Zlo is already there + &movd ($Zhl,$Zhi); + &psrlq ($Zhi,32); + &movd ($Zlh,$Zlo); + &movd ($Zhh,$Zhi); + + &bswap ($Zll); + &bswap ($Zhl); + &bswap ($Zlh); + &bswap ($Zhh); +} + +&function_begin("gcm_gmult_4bit_mmx"); + &mov ($inp,&wparam(0)); # load Xi + &mov ($Htbl,&wparam(1)); # load Htable + + &call (&label("pic_point")); + &set_label("pic_point"); + &blindpop("eax"); + &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); + + &movz ($Zll,&BP(15,$inp)); + + &mmx_loop($inp,"eax"); + + &emms (); + &mov (&DWP(12,$inp),$Zll); + &mov (&DWP(4,$inp),$Zhl); + &mov (&DWP(8,$inp),$Zlh); + &mov (&DWP(0,$inp),$Zhh); +&function_end("gcm_gmult_4bit_mmx"); + +###################################################################### +# Below subroutine is "528B" variant of "4-bit" GCM GHASH function +# (see gcm128.c for details). It provides further 20-40% performance +# improvement over above mentioned "May" version. + +&static_label("rem_8bit"); + +&function_begin("gcm_ghash_4bit_mmx"); +{ my ($Zlo,$Zhi) = ("mm7","mm6"); + my $rem_8bit = "esi"; + my $Htbl = "ebx"; + + # parameter block + &mov ("eax",&wparam(0)); # Xi + &mov ("ebx",&wparam(1)); # Htable + &mov ("ecx",&wparam(2)); # inp + &mov ("edx",&wparam(3)); # len + &mov ("ebp","esp"); # original %esp + &call (&label("pic_point")); + &set_label ("pic_point"); + &blindpop ($rem_8bit); + &lea ($rem_8bit,&DWP(&label("rem_8bit")."-".&label("pic_point"),$rem_8bit)); + + &sub ("esp",512+16+16); # allocate stack frame... + &and ("esp",-64); # ...and align it + &sub ("esp",16); # place for (u8)(H[]<<4) + + &add ("edx","ecx"); # pointer to the end of input + &mov (&DWP(528+16+0,"esp"),"eax"); # save Xi + &mov (&DWP(528+16+8,"esp"),"edx"); # save inp+len + &mov (&DWP(528+16+12,"esp"),"ebp"); # save original %esp + + { my @lo = ("mm0","mm1","mm2"); + my @hi = ("mm3","mm4","mm5"); + my @tmp = ("mm6","mm7"); + my $off1=0,$off2=0,$i; + + &add ($Htbl,128); # optimize for size + &lea ("edi",&DWP(16+128,"esp")); + &lea ("ebp",&DWP(16+256+128,"esp")); + + # decompose Htable (low and high parts are kept separately), + # generate Htable[]>>4, (u8)(Htable[]<<4), save to stack... + for ($i=0;$i<18;$i++) { + + &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16); + &movq ($lo[0],&QWP(16*$i+8-128,$Htbl)) if ($i<16); + &psllq ($tmp[1],60) if ($i>1); + &movq ($hi[0],&QWP(16*$i+0-128,$Htbl)) if ($i<16); + &por ($lo[2],$tmp[1]) if ($i>1); + &movq (&QWP($off1-128,"edi"),$lo[1]) if ($i>0 && $i<17); + &psrlq ($lo[1],4) if ($i>0 && $i<17); + &movq (&QWP($off1,"edi"),$hi[1]) if ($i>0 && $i<17); + &movq ($tmp[0],$hi[1]) if ($i>0 && $i<17); + &movq (&QWP($off2-128,"ebp"),$lo[2]) if ($i>1); + &psrlq ($hi[1],4) if ($i>0 && $i<17); + &movq (&QWP($off2,"ebp"),$hi[2]) if ($i>1); + &shl ("edx",4) if ($i<16); + &mov (&BP($i,"esp"),&LB("edx")) if ($i<16); + + unshift (@lo,pop(@lo)); # "rotate" registers + unshift (@hi,pop(@hi)); + unshift (@tmp,pop(@tmp)); + $off1 += 8 if ($i>0); + $off2 += 8 if ($i>1); + } + } + + &movq ($Zhi,&QWP(0,"eax")); + &mov ("ebx",&DWP(8,"eax")); + &mov ("edx",&DWP(12,"eax")); # load Xi + +&set_label("outer",16); + { my $nlo = "eax"; + my $dat = "edx"; + my @nhi = ("edi","ebp"); + my @rem = ("ebx","ecx"); + my @red = ("mm0","mm1","mm2"); + my $tmp = "mm3"; + + &xor ($dat,&DWP(12,"ecx")); # merge input data + &xor ("ebx",&DWP(8,"ecx")); + &pxor ($Zhi,&QWP(0,"ecx")); + &lea ("ecx",&DWP(16,"ecx")); # inp+=16 + #&mov (&DWP(528+12,"esp"),$dat); # save inp^Xi + &mov (&DWP(528+8,"esp"),"ebx"); + &movq (&QWP(528+0,"esp"),$Zhi); + &mov (&DWP(528+16+4,"esp"),"ecx"); # save inp + + &xor ($nlo,$nlo); + &rol ($dat,8); + &mov (&LB($nlo),&LB($dat)); + &mov ($nhi[1],$nlo); + &and (&LB($nlo),0x0f); + &shr ($nhi[1],4); + &pxor ($red[0],$red[0]); + &rol ($dat,8); # next byte + &pxor ($red[1],$red[1]); + &pxor ($red[2],$red[2]); + + # Just like in "May" verson modulo-schedule for critical path in + # 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor' + # is scheduled so late that rem_8bit[] has to be shifted *right* + # by 16, which is why last argument to pinsrw is 2, which + # corresponds to <<32=<<48>>16... + for ($j=11,$i=0;$i<15;$i++) { + + if ($i>0) { + &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] + &rol ($dat,8); # next byte + &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); + + &pxor ($Zlo,$tmp); + &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); + &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) + } else { + &movq ($Zlo,&QWP(16,"esp",$nlo,8)); + &movq ($Zhi,&QWP(16+128,"esp",$nlo,8)); + } + + &mov (&LB($nlo),&LB($dat)); + &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0); + + &movd ($rem[0],$Zlo); + &movz ($rem[1],&LB($rem[1])) if ($i>0); + &psrlq ($Zlo,8); # Z>>=8 + + &movq ($tmp,$Zhi); + &mov ($nhi[0],$nlo); + &psrlq ($Zhi,8); + + &pxor ($Zlo,&QWP(16+256+0,"esp",$nhi[1],8)); # Z^=H[nhi]>>4 + &and (&LB($nlo),0x0f); + &psllq ($tmp,56); + + &pxor ($Zhi,$red[1]) if ($i>1); + &shr ($nhi[0],4); + &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2) if ($i>0); + + unshift (@red,pop(@red)); # "rotate" registers + unshift (@rem,pop(@rem)); + unshift (@nhi,pop(@nhi)); + } + + &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo] + &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8)); + &xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4) + + &pxor ($Zlo,$tmp); + &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8)); + &movz ($rem[1],&LB($rem[1])); + + &pxor ($red[2],$red[2]); # clear 2nd word + &psllq ($red[1],4); + + &movd ($rem[0],$Zlo); + &psrlq ($Zlo,4); # Z>>=4 + + &movq ($tmp,$Zhi); + &psrlq ($Zhi,4); + &shl ($rem[0],4); # rem<<4 + + &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi] + &psllq ($tmp,60); + &movz ($rem[0],&LB($rem[0])); + + &pxor ($Zlo,$tmp); + &pxor ($Zhi,&QWP(16+128,"esp",$nhi[1],8)); + + &pinsrw ($red[0],&WP(0,$rem_8bit,$rem[1],2),2); + &pxor ($Zhi,$red[1]); + + &movd ($dat,$Zlo); + &pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48 + + &psllq ($red[0],12); # correct by <<16>>4 + &pxor ($Zhi,$red[0]); + &psrlq ($Zlo,32); + &pxor ($Zhi,$red[2]); + + &mov ("ecx",&DWP(528+16+4,"esp")); # restore inp + &movd ("ebx",$Zlo); + &movq ($tmp,$Zhi); # 01234567 + &psllw ($Zhi,8); # 1.3.5.7. + &psrlw ($tmp,8); # .0.2.4.6 + &por ($Zhi,$tmp); # 10325476 + &bswap ($dat); + &pshufw ($Zhi,$Zhi,0b00011011); # 76543210 + &bswap ("ebx"); + + &cmp ("ecx",&DWP(528+16+8,"esp")); # are we done? + &jne (&label("outer")); + } + + &mov ("eax",&DWP(528+16+0,"esp")); # restore Xi + &mov (&DWP(12,"eax"),"edx"); + &mov (&DWP(8,"eax"),"ebx"); + &movq (&QWP(0,"eax"),$Zhi); + + &mov ("esp",&DWP(528+16+12,"esp")); # restore original %esp + &emms (); +} +&function_end("gcm_ghash_4bit_mmx"); +}} + +if ($sse2) {{ +###################################################################### +# PCLMULQDQ version. + +$Xip="eax"; +$Htbl="edx"; +$const="ecx"; +$inp="esi"; +$len="ebx"; + +($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2"; +($T1,$T2,$T3)=("xmm3","xmm4","xmm5"); +($Xn,$Xhn)=("xmm6","xmm7"); + +&static_label("bswap"); + +sub clmul64x64_T2 { # minimal "register" pressure +my ($Xhi,$Xi,$Hkey)=@_; + + &movdqa ($Xhi,$Xi); # + &pshufd ($T1,$Xi,0b01001110); + &pshufd ($T2,$Hkey,0b01001110); + &pxor ($T1,$Xi); # + &pxor ($T2,$Hkey); + + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pclmulqdq ($T1,$T2,0x00); ####### + &xorps ($T1,$Xi); # + &xorps ($T1,$Xhi); # + + &movdqa ($T2,$T1); # + &psrldq ($T1,8); + &pslldq ($T2,8); # + &pxor ($Xhi,$T1); + &pxor ($Xi,$T2); # +} + +sub clmul64x64_T3 { +# Even though this subroutine offers visually better ILP, it +# was empirically found to be a tad slower than above version. +# At least in gcm_ghash_clmul context. But it's just as well, +# because loop modulo-scheduling is possible only thanks to +# minimized "register" pressure... +my ($Xhi,$Xi,$Hkey)=@_; + + &movdqa ($T1,$Xi); # + &movdqa ($Xhi,$Xi); + &pclmulqdq ($Xi,$Hkey,0x00); ####### + &pclmulqdq ($Xhi,$Hkey,0x11); ####### + &pshufd ($T2,$T1,0b01001110); # + &pshufd ($T3,$Hkey,0b01001110); + &pxor ($T2,$T1); # + &pxor ($T3,$Hkey); + &pclmulqdq ($T2,$T3,0x00); ####### + &pxor ($T2,$Xi); # + &pxor ($T2,$Xhi); # + + &movdqa ($T3,$T2); # + &psrldq ($T2,8); + &pslldq ($T3,8); # + &pxor ($Xhi,$T2); + &pxor ($Xi,$T3); # +} + +if (1) { # Algorithm 9 with <<1 twist. + # Reduction is shorter and uses only two + # temporary registers, which makes it better + # candidate for interleaving with 64x64 + # multiplication. Pre-modulo-scheduled loop + # was found to be ~20% faster than Algorithm 5 + # below. Algorithm 9 was therefore chosen for + # further optimization... + +sub reduction_alg9 { # 17/13 times faster than Intel version +my ($Xhi,$Xi) = @_; + + # 1st phase + &movdqa ($T1,$Xi) # + &psllq ($Xi,1); + &pxor ($Xi,$T1); # + &psllq ($Xi,5); # + &pxor ($Xi,$T1); # + &psllq ($Xi,57); # + &movdqa ($T2,$Xi); # + &pslldq ($Xi,8); + &psrldq ($T2,8); # + &pxor ($Xi,$T1); + &pxor ($Xhi,$T2); # + + # 2nd phase + &movdqa ($T2,$Xi); + &psrlq ($Xi,5); + &pxor ($Xi,$T2); # + &psrlq ($Xi,1); # + &pxor ($Xi,$T2); # + &pxor ($T2,$Xhi); + &psrlq ($Xi,1); # + &pxor ($Xi,$T2); # +} + +&function_begin_B("gcm_init_clmul"); + &mov ($Htbl,&wparam(0)); + &mov ($Xip,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Hkey,&QWP(0,$Xip)); + &pshufd ($Hkey,$Hkey,0b01001110);# dword swap + + # <<1 twist + &pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword + &movdqa ($T1,$Hkey); + &psllq ($Hkey,1); + &pxor ($T3,$T3); # + &psrlq ($T1,63); + &pcmpgtd ($T3,$T2); # broadcast carry bit + &pslldq ($T1,8); + &por ($Hkey,$T1); # H<<=1 + + # magic reduction + &pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial + &pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial + + # calculate H^2 + &movdqa ($Xi,$Hkey); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); + + &movdqu (&QWP(0,$Htbl),$Hkey); # save H + &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 + + &ret (); +&function_end_B("gcm_init_clmul"); + +&function_begin_B("gcm_gmult_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movups ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); + + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); + + &ret (); +&function_end_B("gcm_gmult_clmul"); + +&function_begin("gcm_ghash_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + &mov ($inp,&wparam(2)); + &mov ($len,&wparam(3)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + + &sub ($len,0x10); + &jz (&label("odd_tail")); + + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 + &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + + &lea ($inp,&DWP(32,$inp)); # i+=2 + &sub ($len,0x20); + &jbe (&label("even_tail")); + +&set_label("mod_loop"); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movups ($Hkey,&QWP(0,$Htbl)); # load H + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + + &movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1 + &movdqa ($Xhn,$Xn); + &pxor ($Xhi,$T1); # "Ii+Xi", consume early + + &movdqa ($T1,$Xi) #&reduction_alg9($Xhi,$Xi); 1st phase + &psllq ($Xi,1); + &pxor ($Xi,$T1); # + &psllq ($Xi,5); # + &pxor ($Xi,$T1); # + &pclmulqdq ($Xn,$Hkey,0x00); ####### + &psllq ($Xi,57); # + &movdqa ($T2,$Xi); # + &pslldq ($Xi,8); + &psrldq ($T2,8); # + &pxor ($Xi,$T1); + &pshufd ($T1,$T3,0b01001110); + &pxor ($Xhi,$T2); # + &pxor ($T1,$T3); + &pshufd ($T3,$Hkey,0b01001110); + &pxor ($T3,$Hkey); # + + &pclmulqdq ($Xhn,$Hkey,0x11); ####### + &movdqa ($T2,$Xi); # 2nd phase + &psrlq ($Xi,5); + &pxor ($Xi,$T2); # + &psrlq ($Xi,1); # + &pxor ($Xi,$T2); # + &pxor ($T2,$Xhi); + &psrlq ($Xi,1); # + &pxor ($Xi,$T2); # + + &pclmulqdq ($T1,$T3,0x00); ####### + &movups ($Hkey,&QWP(16,$Htbl)); # load H^2 + &xorps ($T1,$Xn); # + &xorps ($T1,$Xhn); # + + &movdqa ($T3,$T1); # + &psrldq ($T1,8); + &pslldq ($T3,8); # + &pxor ($Xhn,$T1); + &pxor ($Xn,$T3); # + &movdqa ($T3,&QWP(0,$const)); + + &lea ($inp,&DWP(32,$inp)); + &sub ($len,0x20); + &ja (&label("mod_loop")); + +&set_label("even_tail"); + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &reduction_alg9 ($Xhi,$Xi); + + &test ($len,$len); + &jnz (&label("done")); + + &movups ($Hkey,&QWP(0,$Htbl)); # load H +&set_label("odd_tail"); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &pshufb ($T1,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &reduction_alg9 ($Xhi,$Xi); + +&set_label("done"); + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); +&function_end("gcm_ghash_clmul"); + +} else { # Algorith 5. Kept for reference purposes. + +sub reduction_alg5 { # 19/16 times faster than Intel version +my ($Xhi,$Xi)=@_; + + # <<1 + &movdqa ($T1,$Xi); # + &movdqa ($T2,$Xhi); + &pslld ($Xi,1); + &pslld ($Xhi,1); # + &psrld ($T1,31); + &psrld ($T2,31); # + &movdqa ($T3,$T1); + &pslldq ($T1,4); + &psrldq ($T3,12); # + &pslldq ($T2,4); + &por ($Xhi,$T3); # + &por ($Xi,$T1); + &por ($Xhi,$T2); # + + # 1st phase + &movdqa ($T1,$Xi); + &movdqa ($T2,$Xi); + &movdqa ($T3,$Xi); # + &pslld ($T1,31); + &pslld ($T2,30); + &pslld ($Xi,25); # + &pxor ($T1,$T2); + &pxor ($T1,$Xi); # + &movdqa ($T2,$T1); # + &pslldq ($T1,12); + &psrldq ($T2,4); # + &pxor ($T3,$T1); + + # 2nd phase + &pxor ($Xhi,$T3); # + &movdqa ($Xi,$T3); + &movdqa ($T1,$T3); + &psrld ($Xi,1); # + &psrld ($T1,2); + &psrld ($T3,7); # + &pxor ($Xi,$T1); + &pxor ($Xhi,$T2); + &pxor ($Xi,$T3); # + &pxor ($Xi,$Xhi); # +} + +&function_begin_B("gcm_init_clmul"); + &mov ($Htbl,&wparam(0)); + &mov ($Xip,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Hkey,&QWP(0,$Xip)); + &pshufd ($Hkey,$Hkey,0b01001110);# dword swap + + # calculate H^2 + &movdqa ($Xi,$Hkey); + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); + &reduction_alg5 ($Xhi,$Xi); + + &movdqu (&QWP(0,$Htbl),$Hkey); # save H + &movdqu (&QWP(16,$Htbl),$Xi); # save H^2 + + &ret (); +&function_end_B("gcm_init_clmul"); + +&function_begin_B("gcm_gmult_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($Xn,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$Xn); + + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); + &reduction_alg5 ($Xhi,$Xi); + + &pshufb ($Xi,$Xn); + &movdqu (&QWP(0,$Xip),$Xi); + + &ret (); +&function_end_B("gcm_gmult_clmul"); + +&function_begin("gcm_ghash_clmul"); + &mov ($Xip,&wparam(0)); + &mov ($Htbl,&wparam(1)); + &mov ($inp,&wparam(2)); + &mov ($len,&wparam(3)); + + &call (&label("pic")); +&set_label("pic"); + &blindpop ($const); + &lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const)); + + &movdqu ($Xi,&QWP(0,$Xip)); + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($Hkey,&QWP(0,$Htbl)); + &pshufb ($Xi,$T3); + + &sub ($len,0x10); + &jz (&label("odd_tail")); + + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 + &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 + + &sub ($len,0x20); + &lea ($inp,&DWP(32,$inp)); # i+=2 + &jbe (&label("even_tail")); + +&set_label("mod_loop"); + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + &movdqu ($Hkey,&QWP(0,$Htbl)); # load H + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &reduction_alg5 ($Xhi,$Xi); + + ####### + &movdqa ($T3,&QWP(0,$const)); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &movdqu ($Xn,&QWP(16,$inp)); # Ii+1 + &pshufb ($T1,$T3); + &pshufb ($Xn,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1 + &movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2 + + &sub ($len,0x20); + &lea ($inp,&DWP(32,$inp)); + &ja (&label("mod_loop")); + +&set_label("even_tail"); + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi) + + &pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi) + &pxor ($Xhi,$Xhn); + + &reduction_alg5 ($Xhi,$Xi); + + &movdqa ($T3,&QWP(0,$const)); + &test ($len,$len); + &jnz (&label("done")); + + &movdqu ($Hkey,&QWP(0,$Htbl)); # load H +&set_label("odd_tail"); + &movdqu ($T1,&QWP(0,$inp)); # Ii + &pshufb ($T1,$T3); + &pxor ($Xi,$T1); # Ii+Xi + + &clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &reduction_alg5 ($Xhi,$Xi); + + &movdqa ($T3,&QWP(0,$const)); +&set_label("done"); + &pshufb ($Xi,$T3); + &movdqu (&QWP(0,$Xip),$Xi); +&function_end("gcm_ghash_clmul"); + +} + +&set_label("bswap",64); + &data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0); + &data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial +}} # $sse2 + +&set_label("rem_4bit",64); + &data_word(0,0x0000<<$S,0,0x1C20<<$S,0,0x3840<<$S,0,0x2460<<$S); + &data_word(0,0x7080<<$S,0,0x6CA0<<$S,0,0x48C0<<$S,0,0x54E0<<$S); + &data_word(0,0xE100<<$S,0,0xFD20<<$S,0,0xD940<<$S,0,0xC560<<$S); + &data_word(0,0x9180<<$S,0,0x8DA0<<$S,0,0xA9C0<<$S,0,0xB5E0<<$S); +&set_label("rem_8bit",64); + &data_short(0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E); + &data_short(0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E); + &data_short(0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E); + &data_short(0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E); + &data_short(0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E); + &data_short(0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E); + &data_short(0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E); + &data_short(0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E); + &data_short(0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE); + &data_short(0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE); + &data_short(0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE); + &data_short(0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE); + &data_short(0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E); + &data_short(0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E); + &data_short(0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE); + &data_short(0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE); + &data_short(0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E); + &data_short(0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E); + &data_short(0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E); + &data_short(0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E); + &data_short(0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E); + &data_short(0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E); + &data_short(0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E); + &data_short(0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E); + &data_short(0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE); + &data_short(0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE); + &data_short(0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE); + &data_short(0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE); + &data_short(0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E); + &data_short(0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E); + &data_short(0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE); + &data_short(0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE); +}}} # !$x86only + +&asciz("GHASH for x86, CRYPTOGAMS by "); +&asm_finish(); + +# A question was risen about choice of vanilla MMX. Or rather why wasn't +# SSE2 chosen instead? In addition to the fact that MMX runs on legacy +# CPUs such as PIII, "4-bit" MMX version was observed to provide better +# performance than *corresponding* SSE2 one even on contemporary CPUs. +# SSE2 results were provided by Peter-Michael Hager. He maintains SSE2 +# implementation featuring full range of lookup-table sizes, but with +# per-invocation lookup table setup. Latter means that table size is +# chosen depending on how much data is to be hashed in every given call, +# more data - larger table. Best reported result for Core2 is ~4 cycles +# per processed byte out of 64KB block. This number accounts even for +# 64KB table setup overhead. As discussed in gcm128.c we choose to be +# more conservative in respect to lookup table sizes, but how do the +# results compare? Minimalistic "256B" MMX version delivers ~11 cycles +# on same platform. As also discussed in gcm128.c, next in line "8-bit +# Shoup's" or "4KB" method should deliver twice the performance of +# "256B" one, in other words not worse than ~6 cycles per byte. It +# should be also be noted that in SSE2 case improvement can be "super- +# linear," i.e. more than twice, mostly because >>8 maps to single +# instruction on SSE2 register. This is unlike "4-bit" case when >>4 +# maps to same amount of instructions in both MMX and SSE2 cases. +# Bottom line is that switch to SSE2 is considered to be justifiable +# only in case we choose to implement "8-bit" method... diff --git a/src/lib/libcrypto/modes/asm/ghash-x86_64.pl b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl new file mode 100644 index 0000000000..a5ae180882 --- /dev/null +++ b/src/lib/libcrypto/modes/asm/ghash-x86_64.pl @@ -0,0 +1,805 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March, June 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that +# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH +# function features so called "528B" variant utilizing additional +# 256+16 bytes of per-key storage [+512 bytes shared table]. +# Performance results are for this streamed GHASH subroutine and are +# expressed in cycles per processed byte, less is better: +# +# gcc 3.4.x(*) assembler +# +# P4 28.6 14.0 +100% +# Opteron 19.3 7.7 +150% +# Core2 17.8 8.1(**) +120% +# +# (*) comparison is not completely fair, because C results are +# for vanilla "256B" implementation, while assembler results +# are for "528B";-) +# (**) it's mystery [to me] why Core2 result is not same as for +# Opteron; + +# May 2010 +# +# Add PCLMULQDQ version performing at 2.02 cycles per processed byte. +# See ghash-x86.pl for background information and details about coding +# techniques. +# +# Special thanks to David Woodhouse for +# providing access to a Westmere-based system on behalf of Intel +# Open Source Technology Centre. + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +# common register layout +$nlo="%rax"; +$nhi="%rbx"; +$Zlo="%r8"; +$Zhi="%r9"; +$tmp="%r10"; +$rem_4bit = "%r11"; + +$Xi="%rdi"; +$Htbl="%rsi"; + +# per-function register layout +$cnt="%rcx"; +$rem="%rdx"; + +sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or + $r =~ s/%[er]([sd]i)/%\1l/ or + $r =~ s/%[er](bp)/%\1l/ or + $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +{ my $N; + sub loop() { + my $inp = shift; + + $N++; +$code.=<<___; + xor $nlo,$nlo + xor $nhi,$nhi + mov `&LB("$Zlo")`,`&LB("$nlo")` + mov `&LB("$Zlo")`,`&LB("$nhi")` + shl \$4,`&LB("$nlo")` + mov \$14,$cnt + mov 8($Htbl,$nlo),$Zlo + mov ($Htbl,$nlo),$Zhi + and \$0xf0,`&LB("$nhi")` + mov $Zlo,$rem + jmp .Loop$N + +.align 16 +.Loop$N: + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + mov ($inp,$cnt),`&LB("$nlo")` + shr \$4,$Zhi + xor 8($Htbl,$nhi),$Zlo + shl \$60,$tmp + xor ($Htbl,$nhi),$Zhi + mov `&LB("$nlo")`,`&LB("$nhi")` + xor ($rem_4bit,$rem,8),$Zhi + mov $Zlo,$rem + shl \$4,`&LB("$nlo")` + xor $tmp,$Zlo + dec $cnt + js .Lbreak$N + + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + shr \$4,$Zhi + xor 8($Htbl,$nlo),$Zlo + shl \$60,$tmp + xor ($Htbl,$nlo),$Zhi + and \$0xf0,`&LB("$nhi")` + xor ($rem_4bit,$rem,8),$Zhi + mov $Zlo,$rem + xor $tmp,$Zlo + jmp .Loop$N + +.align 16 +.Lbreak$N: + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + shr \$4,$Zhi + xor 8($Htbl,$nlo),$Zlo + shl \$60,$tmp + xor ($Htbl,$nlo),$Zhi + and \$0xf0,`&LB("$nhi")` + xor ($rem_4bit,$rem,8),$Zhi + mov $Zlo,$rem + xor $tmp,$Zlo + + shr \$4,$Zlo + and \$0xf,$rem + mov $Zhi,$tmp + shr \$4,$Zhi + xor 8($Htbl,$nhi),$Zlo + shl \$60,$tmp + xor ($Htbl,$nhi),$Zhi + xor $tmp,$Zlo + xor ($rem_4bit,$rem,8),$Zhi + + bswap $Zlo + bswap $Zhi +___ +}} + +$code=<<___; +.text + +.globl gcm_gmult_4bit +.type gcm_gmult_4bit,\@function,2 +.align 16 +gcm_gmult_4bit: + push %rbx + push %rbp # %rbp and %r12 are pushed exclusively in + push %r12 # order to reuse Win64 exception handler... +.Lgmult_prologue: + + movzb 15($Xi),$Zlo + lea .Lrem_4bit(%rip),$rem_4bit +___ + &loop ($Xi); +$code.=<<___; + mov $Zlo,8($Xi) + mov $Zhi,($Xi) + + mov 16(%rsp),%rbx + lea 24(%rsp),%rsp +.Lgmult_epilogue: + ret +.size gcm_gmult_4bit,.-gcm_gmult_4bit +___ + +# per-function register layout +$inp="%rdx"; +$len="%rcx"; +$rem_8bit=$rem_4bit; + +$code.=<<___; +.globl gcm_ghash_4bit +.type gcm_ghash_4bit,\@function,4 +.align 16 +gcm_ghash_4bit: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + sub \$280,%rsp +.Lghash_prologue: + mov $inp,%r14 # reassign couple of args + mov $len,%r15 +___ +{ my $inp="%r14"; + my $dat="%edx"; + my $len="%r15"; + my @nhi=("%ebx","%ecx"); + my @rem=("%r12","%r13"); + my $Hshr4="%rbp"; + + &sub ($Htbl,-128); # size optimization + &lea ($Hshr4,"16+128(%rsp)"); + { my @lo =($nlo,$nhi); + my @hi =($Zlo,$Zhi); + + &xor ($dat,$dat); + for ($i=0,$j=-2;$i<18;$i++,$j++) { + &mov ("$j(%rsp)",&LB($dat)) if ($i>1); + &or ($lo[0],$tmp) if ($i>1); + &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17); + &shr ($lo[1],4) if ($i>0 && $i<17); + &mov ($tmp,$hi[1]) if ($i>0 && $i<17); + &shr ($hi[1],4) if ($i>0 && $i<17); + &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1); + &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16); + &shl (&LB($dat),4) if ($i>0 && $i<17); + &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1); + &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16); + &shl ($tmp,60) if ($i>0 && $i<17); + + push (@lo,shift(@lo)); + push (@hi,shift(@hi)); + } + } + &add ($Htbl,-128); + &mov ($Zlo,"8($Xi)"); + &mov ($Zhi,"0($Xi)"); + &add ($len,$inp); # pointer to the end of data + &lea ($rem_8bit,".Lrem_8bit(%rip)"); + &jmp (".Louter_loop"); + +$code.=".align 16\n.Louter_loop:\n"; + &xor ($Zhi,"($inp)"); + &mov ("%rdx","8($inp)"); + &lea ($inp,"16($inp)"); + &xor ("%rdx",$Zlo); + &mov ("($Xi)",$Zhi); + &mov ("8($Xi)","%rdx"); + &shr ("%rdx",32); + + &xor ($nlo,$nlo); + &rol ($dat,8); + &mov (&LB($nlo),&LB($dat)); + &movz ($nhi[0],&LB($dat)); + &shl (&LB($nlo),4); + &shr ($nhi[0],4); + + for ($j=11,$i=0;$i<15;$i++) { + &rol ($dat,8); + &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0); + &xor ($Zhi,"($Htbl,$nlo)") if ($i>0); + &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0); + &mov ($Zhi,"($Htbl,$nlo)") if ($i==0); + + &mov (&LB($nlo),&LB($dat)); + &xor ($Zlo,$tmp) if ($i>0); + &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0); + + &movz ($nhi[1],&LB($dat)); + &shl (&LB($nlo),4); + &movzb ($rem[0],"(%rsp,$nhi[0])"); + + &shr ($nhi[1],4) if ($i<14); + &and ($nhi[1],0xf0) if ($i==14); + &shl ($rem[1],48) if ($i>0); + &xor ($rem[0],$Zlo); + + &mov ($tmp,$Zhi); + &xor ($Zhi,$rem[1]) if ($i>0); + &shr ($Zlo,8); + + &movz ($rem[0],&LB($rem[0])); + &mov ($dat,"$j($Xi)") if (--$j%4==0); + &shr ($Zhi,8); + + &xor ($Zlo,"-128($Hshr4,$nhi[0],8)"); + &shl ($tmp,56); + &xor ($Zhi,"($Hshr4,$nhi[0],8)"); + + unshift (@nhi,pop(@nhi)); # "rotate" registers + unshift (@rem,pop(@rem)); + } + &movzw ($rem[1],"($rem_8bit,$rem[1],2)"); + &xor ($Zlo,"8($Htbl,$nlo)"); + &xor ($Zhi,"($Htbl,$nlo)"); + + &shl ($rem[1],48); + &xor ($Zlo,$tmp); + + &xor ($Zhi,$rem[1]); + &movz ($rem[0],&LB($Zlo)); + &shr ($Zlo,4); + + &mov ($tmp,$Zhi); + &shl (&LB($rem[0]),4); + &shr ($Zhi,4); + + &xor ($Zlo,"8($Htbl,$nhi[0])"); + &movzw ($rem[0],"($rem_8bit,$rem[0],2)"); + &shl ($tmp,60); + + &xor ($Zhi,"($Htbl,$nhi[0])"); + &xor ($Zlo,$tmp); + &shl ($rem[0],48); + + &bswap ($Zlo); + &xor ($Zhi,$rem[0]); + + &bswap ($Zhi); + &cmp ($inp,$len); + &jb (".Louter_loop"); +} +$code.=<<___; + mov $Zlo,8($Xi) + mov $Zhi,($Xi) + + lea 280(%rsp),%rsi + mov 0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lghash_epilogue: + ret +.size gcm_ghash_4bit,.-gcm_ghash_4bit +___ + +###################################################################### +# PCLMULQDQ version. + +@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order + +($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; +($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); + +sub clmul64x64_T2 { # minimal register pressure +my ($Xhi,$Xi,$Hkey,$modulo)=@_; + +$code.=<<___ if (!defined($modulo)); + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pshufd \$0b01001110,$Hkey,$T2 + pxor $Xi,$T1 # + pxor $Hkey,$T2 +___ +$code.=<<___; + pclmulqdq \$0x00,$Hkey,$Xi ####### + pclmulqdq \$0x11,$Hkey,$Xhi ####### + pclmulqdq \$0x00,$T2,$T1 ####### + pxor $Xi,$T1 # + pxor $Xhi,$T1 # + + movdqa $T1,$T2 # + psrldq \$8,$T1 + pslldq \$8,$T2 # + pxor $T1,$Xhi + pxor $T2,$Xi # +___ +} + +sub reduction_alg9 { # 17/13 times faster than Intel version +my ($Xhi,$Xi) = @_; + +$code.=<<___; + # 1st phase + movdqa $Xi,$T1 # + psllq \$1,$Xi + pxor $T1,$Xi # + psllq \$5,$Xi # + pxor $T1,$Xi # + psllq \$57,$Xi # + movdqa $Xi,$T2 # + pslldq \$8,$Xi + psrldq \$8,$T2 # + pxor $T1,$Xi + pxor $T2,$Xhi # + + # 2nd phase + movdqa $Xi,$T2 + psrlq \$5,$Xi + pxor $T2,$Xi # + psrlq \$1,$Xi # + pxor $T2,$Xi # + pxor $Xhi,$T2 + psrlq \$1,$Xi # + pxor $T2,$Xi # +___ +} + +{ my ($Htbl,$Xip)=@_4args; + +$code.=<<___; +.globl gcm_init_clmul +.type gcm_init_clmul,\@abi-omnipotent +.align 16 +gcm_init_clmul: + movdqu ($Xip),$Hkey + pshufd \$0b01001110,$Hkey,$Hkey # dword swap + + # <<1 twist + pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword + movdqa $Hkey,$T1 + psllq \$1,$Hkey + pxor $T3,$T3 # + psrlq \$63,$T1 + pcmpgtd $T2,$T3 # broadcast carry bit + pslldq \$8,$T1 + por $T1,$Hkey # H<<=1 + + # magic reduction + pand .L0x1c2_polynomial(%rip),$T3 + pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial + + # calculate H^2 + movdqa $Hkey,$Xi +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + movdqu $Hkey,($Htbl) # save H + movdqu $Xi,16($Htbl) # save H^2 + ret +.size gcm_init_clmul,.-gcm_init_clmul +___ +} + +{ my ($Xip,$Htbl)=@_4args; + +$code.=<<___; +.globl gcm_gmult_clmul +.type gcm_gmult_clmul,\@abi-omnipotent +.align 16 +gcm_gmult_clmul: + movdqu ($Xip),$Xi + movdqa .Lbswap_mask(%rip),$T3 + movdqu ($Htbl),$Hkey + pshufb $T3,$Xi +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + pshufb $T3,$Xi + movdqu $Xi,($Xip) + ret +.size gcm_gmult_clmul,.-gcm_gmult_clmul +___ +} + +{ my ($Xip,$Htbl,$inp,$len)=@_4args; + my $Xn="%xmm6"; + my $Xhn="%xmm7"; + my $Hkey2="%xmm8"; + my $T1n="%xmm9"; + my $T2n="%xmm10"; + +$code.=<<___; +.globl gcm_ghash_clmul +.type gcm_ghash_clmul,\@abi-omnipotent +.align 16 +gcm_ghash_clmul: +___ +$code.=<<___ if ($win64); +.LSEH_begin_gcm_ghash_clmul: + # I can't trust assembler to use specific encoding:-( + .byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp + .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) + .byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp) + .byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp) + .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp) + .byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp) +___ +$code.=<<___; + movdqa .Lbswap_mask(%rip),$T3 + + movdqu ($Xip),$Xi + movdqu ($Htbl),$Hkey + pshufb $T3,$Xi + + sub \$0x10,$len + jz .Lodd_tail + + movdqu 16($Htbl),$Hkey2 + ####### + # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = + # [(H*Ii+1) + (H*Xi+1)] mod P = + # [(H*Ii+1) + H^2*(Ii+Xi)] mod P + # + movdqu ($inp),$T1 # Ii + movdqu 16($inp),$Xn # Ii+1 + pshufb $T3,$T1 + pshufb $T3,$Xn + pxor $T1,$Xi # Ii+Xi +___ + &clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1 +$code.=<<___; + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pshufd \$0b01001110,$Hkey2,$T2 + pxor $Xi,$T1 # + pxor $Hkey2,$T2 + + lea 32($inp),$inp # i+=2 + sub \$0x20,$len + jbe .Leven_tail + +.Lmod_loop: +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi) +$code.=<<___; + movdqu ($inp),$T1 # Ii + pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pxor $Xhn,$Xhi + + movdqu 16($inp),$Xn # Ii+1 + pshufb $T3,$T1 + pshufb $T3,$Xn + + movdqa $Xn,$Xhn # + pshufd \$0b01001110,$Xn,$T1n + pshufd \$0b01001110,$Hkey,$T2n + pxor $Xn,$T1n # + pxor $Hkey,$T2n + pxor $T1,$Xhi # "Ii+Xi", consume early + + movdqa $Xi,$T1 # 1st phase + psllq \$1,$Xi + pxor $T1,$Xi # + psllq \$5,$Xi # + pxor $T1,$Xi # + pclmulqdq \$0x00,$Hkey,$Xn ####### + psllq \$57,$Xi # + movdqa $Xi,$T2 # + pslldq \$8,$Xi + psrldq \$8,$T2 # + pxor $T1,$Xi + pxor $T2,$Xhi # + + pclmulqdq \$0x11,$Hkey,$Xhn ####### + movdqa $Xi,$T2 # 2nd phase + psrlq \$5,$Xi + pxor $T2,$Xi # + psrlq \$1,$Xi # + pxor $T2,$Xi # + pxor $Xhi,$T2 + psrlq \$1,$Xi # + pxor $T2,$Xi # + + pclmulqdq \$0x00,$T2n,$T1n ####### + movdqa $Xi,$Xhi # + pshufd \$0b01001110,$Xi,$T1 + pshufd \$0b01001110,$Hkey2,$T2 + pxor $Xi,$T1 # + pxor $Hkey2,$T2 + + pxor $Xn,$T1n # + pxor $Xhn,$T1n # + movdqa $T1n,$T2n # + psrldq \$8,$T1n + pslldq \$8,$T2n # + pxor $T1n,$Xhn + pxor $T2n,$Xn # + + lea 32($inp),$inp + sub \$0x20,$len + ja .Lmod_loop + +.Leven_tail: +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi) +$code.=<<___; + pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi) + pxor $Xhn,$Xhi +___ + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; + test $len,$len + jnz .Ldone + +.Lodd_tail: + movdqu ($inp),$T1 # Ii + pshufb $T3,$T1 + pxor $T1,$Xi # Ii+Xi +___ + &clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi) + &reduction_alg9 ($Xhi,$Xi); +$code.=<<___; +.Ldone: + pshufb $T3,$Xi + movdqu $Xi,($Xip) +___ +$code.=<<___ if ($win64); + movaps (%rsp),%xmm6 + movaps 0x10(%rsp),%xmm7 + movaps 0x20(%rsp),%xmm8 + movaps 0x30(%rsp),%xmm9 + movaps 0x40(%rsp),%xmm10 + add \$0x58,%rsp +___ +$code.=<<___; + ret +.LSEH_end_gcm_ghash_clmul: +.size gcm_ghash_clmul,.-gcm_ghash_clmul +___ +} + +$code.=<<___; +.align 64 +.Lbswap_mask: + .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.L0x1c2_polynomial: + .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.align 64 +.type .Lrem_4bit,\@object +.Lrem_4bit: + .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16` + .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16` + .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16` + .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16` +.type .Lrem_8bit,\@object +.Lrem_8bit: + .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E + .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E + .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E + .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E + .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E + .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E + .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E + .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E + .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE + .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE + .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE + .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE + .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E + .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E + .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE + .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE + .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E + .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E + .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E + .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E + .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E + .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E + .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E + .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E + .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE + .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE + .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE + .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE + .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E + .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E + .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE + .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE + +.asciz "GHASH for x86_64, CRYPTOGAMS by " +.align 64 +___ + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lin_prologue + + lea 24(%rax),%rax # adjust "rsp" + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$`1232/8`,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_gcm_gmult_4bit + .rva .LSEH_end_gcm_gmult_4bit + .rva .LSEH_info_gcm_gmult_4bit + + .rva .LSEH_begin_gcm_ghash_4bit + .rva .LSEH_end_gcm_ghash_4bit + .rva .LSEH_info_gcm_ghash_4bit + + .rva .LSEH_begin_gcm_ghash_clmul + .rva .LSEH_end_gcm_ghash_clmul + .rva .LSEH_info_gcm_ghash_clmul + +.section .xdata +.align 8 +.LSEH_info_gcm_gmult_4bit: + .byte 9,0,0,0 + .rva se_handler + .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData +.LSEH_info_gcm_ghash_4bit: + .byte 9,0,0,0 + .rva se_handler + .rva .Lghash_prologue,.Lghash_epilogue # HandlerData +.LSEH_info_gcm_ghash_clmul: + .byte 0x01,0x1f,0x0b,0x00 + .byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 + .byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 + .byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 + .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 + .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6 + .byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58 +___ +} + +$code =~ s/\`([^\`]*)\`/eval($1)/gem; + +print $code; + +close STDOUT; diff --git a/src/lib/libcrypto/modes/cbc128.c b/src/lib/libcrypto/modes/cbc128.c index 8f8bd563b9..3d3782cbe1 100644 --- a/src/lib/libcrypto/modes/cbc128.c +++ b/src/lib/libcrypto/modes/cbc128.c @@ -48,7 +48,8 @@ * */ -#include "modes.h" +#include +#include "modes_lcl.h" #include #ifndef MODES_DEBUG @@ -58,12 +59,7 @@ #endif #include -#define STRICT_ALIGNMENT 1 -#if defined(__i386) || defined(__i386__) || \ - defined(__x86_64) || defined(__x86_64__) || \ - defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ - defined(__s390__) || defined(__s390x__) -# undef STRICT_ALIGNMENT +#ifndef STRICT_ALIGNMENT # define STRICT_ALIGNMENT 0 #endif diff --git a/src/lib/libcrypto/modes/ccm128.c b/src/lib/libcrypto/modes/ccm128.c new file mode 100644 index 0000000000..c9b35e5b35 --- /dev/null +++ b/src/lib/libcrypto/modes/ccm128.c @@ -0,0 +1,441 @@ +/* ==================================================================== + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include +#include "modes_lcl.h" +#include + +#ifndef MODES_DEBUG +# ifndef NDEBUG +# define NDEBUG +# endif +#endif +#include + +/* First you setup M and L parameters and pass the key schedule. + * This is called once per session setup... */ +void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx, + unsigned int M,unsigned int L,void *key,block128_f block) +{ + memset(ctx->nonce.c,0,sizeof(ctx->nonce.c)); + ctx->nonce.c[0] = ((u8)(L-1)&7) | (u8)(((M-2)/2)&7)<<3; + ctx->blocks = 0; + ctx->block = block; + ctx->key = key; +} + +/* !!! Following interfaces are to be called *once* per packet !!! */ + +/* Then you setup per-message nonce and pass the length of the message */ +int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, + const unsigned char *nonce,size_t nlen,size_t mlen) +{ + unsigned int L = ctx->nonce.c[0]&7; /* the L parameter */ + + if (nlen<(14-L)) return -1; /* nonce is too short */ + + if (sizeof(mlen)==8 && L>=3) { + ctx->nonce.c[8] = (u8)(mlen>>(56%(sizeof(mlen)*8))); + ctx->nonce.c[9] = (u8)(mlen>>(48%(sizeof(mlen)*8))); + ctx->nonce.c[10] = (u8)(mlen>>(40%(sizeof(mlen)*8))); + ctx->nonce.c[11] = (u8)(mlen>>(32%(sizeof(mlen)*8))); + } + else + *(u32*)(&ctx->nonce.c[8]) = 0; + + ctx->nonce.c[12] = (u8)(mlen>>24); + ctx->nonce.c[13] = (u8)(mlen>>16); + ctx->nonce.c[14] = (u8)(mlen>>8); + ctx->nonce.c[15] = (u8)mlen; + + ctx->nonce.c[0] &= ~0x40; /* clear Adata flag */ + memcpy(&ctx->nonce.c[1],nonce,14-L); + + return 0; +} + +/* Then you pass additional authentication data, this is optional */ +void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, + const unsigned char *aad,size_t alen) +{ unsigned int i; + block128_f block = ctx->block; + + if (alen==0) return; + + ctx->nonce.c[0] |= 0x40; /* set Adata flag */ + (*block)(ctx->nonce.c,ctx->cmac.c,ctx->key), + ctx->blocks++; + + if (alen<(0x10000-0x100)) { + ctx->cmac.c[0] ^= (u8)(alen>>8); + ctx->cmac.c[1] ^= (u8)alen; + i=2; + } + else if (sizeof(alen)==8 && alen>=(size_t)1<<(32%(sizeof(alen)*8))) { + ctx->cmac.c[0] ^= 0xFF; + ctx->cmac.c[1] ^= 0xFF; + ctx->cmac.c[2] ^= (u8)(alen>>(56%(sizeof(alen)*8))); + ctx->cmac.c[3] ^= (u8)(alen>>(48%(sizeof(alen)*8))); + ctx->cmac.c[4] ^= (u8)(alen>>(40%(sizeof(alen)*8))); + ctx->cmac.c[5] ^= (u8)(alen>>(32%(sizeof(alen)*8))); + ctx->cmac.c[6] ^= (u8)(alen>>24); + ctx->cmac.c[7] ^= (u8)(alen>>16); + ctx->cmac.c[8] ^= (u8)(alen>>8); + ctx->cmac.c[9] ^= (u8)alen; + i=10; + } + else { + ctx->cmac.c[0] ^= 0xFF; + ctx->cmac.c[1] ^= 0xFE; + ctx->cmac.c[2] ^= (u8)(alen>>24); + ctx->cmac.c[3] ^= (u8)(alen>>16); + ctx->cmac.c[4] ^= (u8)(alen>>8); + ctx->cmac.c[5] ^= (u8)alen; + i=6; + } + + do { + for(;i<16 && alen;++i,++aad,--alen) + ctx->cmac.c[i] ^= *aad; + (*block)(ctx->cmac.c,ctx->cmac.c,ctx->key), + ctx->blocks++; + i=0; + } while (alen); +} + +/* Finally you encrypt or decrypt the message */ + +/* counter part of nonce may not be larger than L*8 bits, + * L is not larger than 8, therefore 64-bit counter... */ +static void ctr64_inc(unsigned char *counter) { + unsigned int n=8; + u8 c; + + counter += 8; + do { + --n; + c = counter[n]; + ++c; + counter[n] = c; + if (c) return; + } while (n); +} + +int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, + size_t len) +{ + size_t n; + unsigned int i,L; + unsigned char flags0 = ctx->nonce.c[0]; + block128_f block = ctx->block; + void * key = ctx->key; + union { u64 u[2]; u8 c[16]; } scratch; + + if (!(flags0&0x40)) + (*block)(ctx->nonce.c,ctx->cmac.c,key), + ctx->blocks++; + + ctx->nonce.c[0] = L = flags0&7; + for (n=0,i=15-L;i<15;++i) { + n |= ctx->nonce.c[i]; + ctx->nonce.c[i]=0; + n <<= 8; + } + n |= ctx->nonce.c[15]; /* reconstructed length */ + ctx->nonce.c[15]=1; + + if (n!=len) return -1; /* length mismatch */ + + ctx->blocks += ((len+15)>>3)|1; + if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */ + + while (len>=16) { +#if defined(STRICT_ALIGNMENT) + union { u64 u[2]; u8 c[16]; } temp; + + memcpy (temp.c,inp,16); + ctx->cmac.u[0] ^= temp.u[0]; + ctx->cmac.u[1] ^= temp.u[1]; +#else + ctx->cmac.u[0] ^= ((u64*)inp)[0]; + ctx->cmac.u[1] ^= ((u64*)inp)[1]; +#endif + (*block)(ctx->cmac.c,ctx->cmac.c,key); + (*block)(ctx->nonce.c,scratch.c,key); + ctr64_inc(ctx->nonce.c); +#if defined(STRICT_ALIGNMENT) + temp.u[0] ^= scratch.u[0]; + temp.u[1] ^= scratch.u[1]; + memcpy(out,temp.c,16); +#else + ((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]; + ((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]; +#endif + inp += 16; + out += 16; + len -= 16; + } + + if (len) { + for (i=0; icmac.c[i] ^= inp[i]; + (*block)(ctx->cmac.c,ctx->cmac.c,key); + (*block)(ctx->nonce.c,scratch.c,key); + for (i=0; inonce.c[i]=0; + + (*block)(ctx->nonce.c,scratch.c,key); + ctx->cmac.u[0] ^= scratch.u[0]; + ctx->cmac.u[1] ^= scratch.u[1]; + + ctx->nonce.c[0] = flags0; + + return 0; +} + +int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, + size_t len) +{ + size_t n; + unsigned int i,L; + unsigned char flags0 = ctx->nonce.c[0]; + block128_f block = ctx->block; + void * key = ctx->key; + union { u64 u[2]; u8 c[16]; } scratch; + + if (!(flags0&0x40)) + (*block)(ctx->nonce.c,ctx->cmac.c,key); + + ctx->nonce.c[0] = L = flags0&7; + for (n=0,i=15-L;i<15;++i) { + n |= ctx->nonce.c[i]; + ctx->nonce.c[i]=0; + n <<= 8; + } + n |= ctx->nonce.c[15]; /* reconstructed length */ + ctx->nonce.c[15]=1; + + if (n!=len) return -1; + + while (len>=16) { +#if defined(STRICT_ALIGNMENT) + union { u64 u[2]; u8 c[16]; } temp; +#endif + (*block)(ctx->nonce.c,scratch.c,key); + ctr64_inc(ctx->nonce.c); +#if defined(STRICT_ALIGNMENT) + memcpy (temp.c,inp,16); + ctx->cmac.u[0] ^= (scratch.u[0] ^= temp.u[0]); + ctx->cmac.u[1] ^= (scratch.u[1] ^= temp.u[1]); + memcpy (out,scratch.c,16); +#else + ctx->cmac.u[0] ^= (((u64*)out)[0] = scratch.u[0]^((u64*)inp)[0]); + ctx->cmac.u[1] ^= (((u64*)out)[1] = scratch.u[1]^((u64*)inp)[1]); +#endif + (*block)(ctx->cmac.c,ctx->cmac.c,key); + + inp += 16; + out += 16; + len -= 16; + } + + if (len) { + (*block)(ctx->nonce.c,scratch.c,key); + for (i=0; icmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]); + (*block)(ctx->cmac.c,ctx->cmac.c,key); + } + + for (i=15-L;i<16;++i) + ctx->nonce.c[i]=0; + + (*block)(ctx->nonce.c,scratch.c,key); + ctx->cmac.u[0] ^= scratch.u[0]; + ctx->cmac.u[1] ^= scratch.u[1]; + + ctx->nonce.c[0] = flags0; + + return 0; +} + +static void ctr64_add (unsigned char *counter,size_t inc) +{ size_t n=8, val=0; + + counter += 8; + do { + --n; + val += counter[n] + (inc&0xff); + counter[n] = (unsigned char)val; + val >>= 8; /* carry bit */ + inc >>= 8; + } while(n && (inc || val)); +} + +int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, + size_t len,ccm128_f stream) +{ + size_t n; + unsigned int i,L; + unsigned char flags0 = ctx->nonce.c[0]; + block128_f block = ctx->block; + void * key = ctx->key; + union { u64 u[2]; u8 c[16]; } scratch; + + if (!(flags0&0x40)) + (*block)(ctx->nonce.c,ctx->cmac.c,key), + ctx->blocks++; + + ctx->nonce.c[0] = L = flags0&7; + for (n=0,i=15-L;i<15;++i) { + n |= ctx->nonce.c[i]; + ctx->nonce.c[i]=0; + n <<= 8; + } + n |= ctx->nonce.c[15]; /* reconstructed length */ + ctx->nonce.c[15]=1; + + if (n!=len) return -1; /* length mismatch */ + + ctx->blocks += ((len+15)>>3)|1; + if (ctx->blocks > (U64(1)<<61)) return -2; /* too much data */ + + if ((n=len/16)) { + (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c); + n *= 16; + inp += n; + out += n; + len -= n; + if (len) ctr64_add(ctx->nonce.c,n/16); + } + + if (len) { + for (i=0; icmac.c[i] ^= inp[i]; + (*block)(ctx->cmac.c,ctx->cmac.c,key); + (*block)(ctx->nonce.c,scratch.c,key); + for (i=0; inonce.c[i]=0; + + (*block)(ctx->nonce.c,scratch.c,key); + ctx->cmac.u[0] ^= scratch.u[0]; + ctx->cmac.u[1] ^= scratch.u[1]; + + ctx->nonce.c[0] = flags0; + + return 0; +} + +int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, + size_t len,ccm128_f stream) +{ + size_t n; + unsigned int i,L; + unsigned char flags0 = ctx->nonce.c[0]; + block128_f block = ctx->block; + void * key = ctx->key; + union { u64 u[2]; u8 c[16]; } scratch; + + if (!(flags0&0x40)) + (*block)(ctx->nonce.c,ctx->cmac.c,key); + + ctx->nonce.c[0] = L = flags0&7; + for (n=0,i=15-L;i<15;++i) { + n |= ctx->nonce.c[i]; + ctx->nonce.c[i]=0; + n <<= 8; + } + n |= ctx->nonce.c[15]; /* reconstructed length */ + ctx->nonce.c[15]=1; + + if (n!=len) return -1; + + if ((n=len/16)) { + (*stream)(inp,out,n,key,ctx->nonce.c,ctx->cmac.c); + n *= 16; + inp += n; + out += n; + len -= n; + if (len) ctr64_add(ctx->nonce.c,n/16); + } + + if (len) { + (*block)(ctx->nonce.c,scratch.c,key); + for (i=0; icmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]); + (*block)(ctx->cmac.c,ctx->cmac.c,key); + } + + for (i=15-L;i<16;++i) + ctx->nonce.c[i]=0; + + (*block)(ctx->nonce.c,scratch.c,key); + ctx->cmac.u[0] ^= scratch.u[0]; + ctx->cmac.u[1] ^= scratch.u[1]; + + ctx->nonce.c[0] = flags0; + + return 0; +} + +size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx,unsigned char *tag,size_t len) +{ unsigned int M = (ctx->nonce.c[0]>>3)&7; /* the M parameter */ + + M *= 2; M += 2; + if (lencmac.c,M); + return M; +} diff --git a/src/lib/libcrypto/modes/cfb128.c b/src/lib/libcrypto/modes/cfb128.c index e5938c6137..4e6f5d35e1 100644 --- a/src/lib/libcrypto/modes/cfb128.c +++ b/src/lib/libcrypto/modes/cfb128.c @@ -48,7 +48,8 @@ * */ -#include "modes.h" +#include +#include "modes_lcl.h" #include #ifndef MODES_DEBUG @@ -58,14 +59,6 @@ #endif #include -#define STRICT_ALIGNMENT -#if defined(__i386) || defined(__i386__) || \ - defined(__x86_64) || defined(__x86_64__) || \ - defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ - defined(__s390__) || defined(__s390x__) -# undef STRICT_ALIGNMENT -#endif - /* The input and output encrypted as though 128bit cfb mode is being * used. The extra state information to record how much of the * 128bit block we have used is contained in *num; diff --git a/src/lib/libcrypto/modes/ctr128.c b/src/lib/libcrypto/modes/ctr128.c index 932037f551..ee642c5863 100644 --- a/src/lib/libcrypto/modes/ctr128.c +++ b/src/lib/libcrypto/modes/ctr128.c @@ -48,7 +48,8 @@ * */ -#include "modes.h" +#include +#include "modes_lcl.h" #include #ifndef MODES_DEBUG @@ -58,17 +59,6 @@ #endif #include -typedef unsigned int u32; -typedef unsigned char u8; - -#define STRICT_ALIGNMENT -#if defined(__i386) || defined(__i386__) || \ - defined(__x86_64) || defined(__x86_64__) || \ - defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ - defined(__s390__) || defined(__s390x__) -# undef STRICT_ALIGNMENT -#endif - /* NOTE: the IV/counter CTR mode is big-endian. The code itself * is endian-neutral. */ @@ -182,3 +172,81 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, *num=n; } + +/* increment upper 96 bits of 128-bit counter by 1 */ +static void ctr96_inc(unsigned char *counter) { + u32 n=12; + u8 c; + + do { + --n; + c = counter[n]; + ++c; + counter[n] = c; + if (c) return; + } while (n); +} + +void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], unsigned char ecount_buf[16], + unsigned int *num, ctr128_f func) +{ + unsigned int n,ctr32; + + assert(in && out && key && ecount_buf && num); + assert(*num < 16); + + n = *num; + + while (n && len) { + *(out++) = *(in++) ^ ecount_buf[n]; + --len; + n = (n+1) % 16; + } + + ctr32 = GETU32(ivec+12); + while (len>=16) { + size_t blocks = len/16; + /* + * 1<<28 is just a not-so-small yet not-so-large number... + * Below condition is practically never met, but it has to + * be checked for code correctness. + */ + if (sizeof(size_t)>sizeof(unsigned int) && blocks>(1U<<28)) + blocks = (1U<<28); + /* + * As (*func) operates on 32-bit counter, caller + * has to handle overflow. 'if' below detects the + * overflow, which is then handled by limiting the + * amount of blocks to the exact overflow point... + */ + ctr32 += (u32)blocks; + if (ctr32 < blocks) { + blocks -= ctr32; + ctr32 = 0; + } + (*func)(in,out,blocks,key,ivec); + /* (*ctr) does not update ivec, caller does: */ + PUTU32(ivec+12,ctr32); + /* ... overflow was detected, propogate carry. */ + if (ctr32 == 0) ctr96_inc(ivec); + blocks *= 16; + len -= blocks; + out += blocks; + in += blocks; + } + if (len) { + memset(ecount_buf,0,16); + (*func)(ecount_buf,ecount_buf,1,key,ivec); + ++ctr32; + PUTU32(ivec+12,ctr32); + if (ctr32 == 0) ctr96_inc(ivec); + while (len--) { + out[n] = in[n] ^ ecount_buf[n]; + ++n; + } + } + + *num=n; +} diff --git a/src/lib/libcrypto/modes/cts128.c b/src/lib/libcrypto/modes/cts128.c index e0430f9fdc..c0e1f3696c 100644 --- a/src/lib/libcrypto/modes/cts128.c +++ b/src/lib/libcrypto/modes/cts128.c @@ -5,7 +5,8 @@ * forms are granted according to the OpenSSL license. */ -#include "modes.h" +#include +#include "modes_lcl.h" #include #ifndef MODES_DEBUG @@ -23,8 +24,9 @@ * deviates from mentioned RFCs. Most notably it allows input to be * of block length and it doesn't flip the order of the last two * blocks. CTS is being discussed even in ECB context, but it's not - * adopted for any known application. This implementation complies - * with mentioned RFCs and [as such] extends CBC mode. + * adopted for any known application. This implementation provides + * two interfaces: one compliant with above mentioned RFCs and one + * compliant with the NIST proposal, both extending CBC mode. */ size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, @@ -54,6 +56,34 @@ size_t CRYPTO_cts128_encrypt_block(const unsigned char *in, unsigned char *out, return len+residue; } +size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], block128_f block) +{ size_t residue, n; + + assert (in && out && key && ivec); + + if (len < 16) return 0; + + residue=len%16; + + len -= residue; + + CRYPTO_cbc128_encrypt(in,out,len,key,ivec,block); + + if (residue==0) return len; + + in += len; + out += len; + + for (n=0; n +#include "modes_lcl.h" +#include + +#ifndef MODES_DEBUG +# ifndef NDEBUG +# define NDEBUG +# endif +#endif +#include + +#if defined(BSWAP4) && defined(STRICT_ALIGNMENT) +/* redefine, because alignment is ensured */ +#undef GETU32 +#define GETU32(p) BSWAP4(*(const u32 *)(p)) +#undef PUTU32 +#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) +#endif + +#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16)) +#define REDUCE1BIT(V) do { \ + if (sizeof(size_t)==8) { \ + u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \ + V.lo = (V.hi<<63)|(V.lo>>1); \ + V.hi = (V.hi>>1 )^T; \ + } \ + else { \ + u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \ + V.lo = (V.hi<<63)|(V.lo>>1); \ + V.hi = (V.hi>>1 )^((u64)T<<32); \ + } \ +} while(0) + +/* + * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should + * never be set to 8. 8 is effectively reserved for testing purposes. + * TABLE_BITS>1 are lookup-table-driven implementations referred to as + * "Shoup's" in GCM specification. In other words OpenSSL does not cover + * whole spectrum of possible table driven implementations. Why? In + * non-"Shoup's" case memory access pattern is segmented in such manner, + * that it's trivial to see that cache timing information can reveal + * fair portion of intermediate hash value. Given that ciphertext is + * always available to attacker, it's possible for him to attempt to + * deduce secret parameter H and if successful, tamper with messages + * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's + * not as trivial, but there is no reason to believe that it's resistant + * to cache-timing attack. And the thing about "8-bit" implementation is + * that it consumes 16 (sixteen) times more memory, 4KB per individual + * key + 1KB shared. Well, on pros side it should be twice as fast as + * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version + * was observed to run ~75% faster, closer to 100% for commercial + * compilers... Yet "4-bit" procedure is preferred, because it's + * believed to provide better security-performance balance and adequate + * all-round performance. "All-round" refers to things like: + * + * - shorter setup time effectively improves overall timing for + * handling short messages; + * - larger table allocation can become unbearable because of VM + * subsystem penalties (for example on Windows large enough free + * results in VM working set trimming, meaning that consequent + * malloc would immediately incur working set expansion); + * - larger table has larger cache footprint, which can affect + * performance of other code paths (not necessarily even from same + * thread in Hyper-Threading world); + * + * Value of 1 is not appropriate for performance reasons. + */ +#if TABLE_BITS==8 + +static void gcm_init_8bit(u128 Htable[256], u64 H[2]) +{ + int i, j; + u128 V; + + Htable[0].hi = 0; + Htable[0].lo = 0; + V.hi = H[0]; + V.lo = H[1]; + + for (Htable[128]=V, i=64; i>0; i>>=1) { + REDUCE1BIT(V); + Htable[i] = V; + } + + for (i=2; i<256; i<<=1) { + u128 *Hi = Htable+i, H0 = *Hi; + for (j=1; j>8); + Z.hi = (Z.hi>>8); + if (sizeof(size_t)==8) + Z.hi ^= rem_8bit[rem]; + else + Z.hi ^= (u64)rem_8bit[rem]<<32; + } + + if (is_endian.little) { +#ifdef BSWAP8 + Xi[0] = BSWAP8(Z.hi); + Xi[1] = BSWAP8(Z.lo); +#else + u8 *p = (u8 *)Xi; + u32 v; + v = (u32)(Z.hi>>32); PUTU32(p,v); + v = (u32)(Z.hi); PUTU32(p+4,v); + v = (u32)(Z.lo>>32); PUTU32(p+8,v); + v = (u32)(Z.lo); PUTU32(p+12,v); +#endif + } + else { + Xi[0] = Z.hi; + Xi[1] = Z.lo; + } +} +#define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable) + +#elif TABLE_BITS==4 + +static void gcm_init_4bit(u128 Htable[16], u64 H[2]) +{ + u128 V; +#if defined(OPENSSL_SMALL_FOOTPRINT) + int i; +#endif + + Htable[0].hi = 0; + Htable[0].lo = 0; + V.hi = H[0]; + V.lo = H[1]; + +#if defined(OPENSSL_SMALL_FOOTPRINT) + for (Htable[8]=V, i=4; i>0; i>>=1) { + REDUCE1BIT(V); + Htable[i] = V; + } + + for (i=2; i<16; i<<=1) { + u128 *Hi = Htable+i; + int j; + for (V=*Hi, j=1; j>32; + Htable[j].lo = V.hi<<32|V.hi>>32; + } + } +#endif +} + +#ifndef GHASH_ASM +static const size_t rem_4bit[16] = { + PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460), + PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0), + PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560), + PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) }; + +static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]) +{ + u128 Z; + int cnt = 15; + size_t rem, nlo, nhi; + const union { long one; char little; } is_endian = {1}; + + nlo = ((const u8 *)Xi)[15]; + nhi = nlo>>4; + nlo &= 0xf; + + Z.hi = Htable[nlo].hi; + Z.lo = Htable[nlo].lo; + + while (1) { + rem = (size_t)Z.lo&0xf; + Z.lo = (Z.hi<<60)|(Z.lo>>4); + Z.hi = (Z.hi>>4); + if (sizeof(size_t)==8) + Z.hi ^= rem_4bit[rem]; + else + Z.hi ^= (u64)rem_4bit[rem]<<32; + + Z.hi ^= Htable[nhi].hi; + Z.lo ^= Htable[nhi].lo; + + if (--cnt<0) break; + + nlo = ((const u8 *)Xi)[cnt]; + nhi = nlo>>4; + nlo &= 0xf; + + rem = (size_t)Z.lo&0xf; + Z.lo = (Z.hi<<60)|(Z.lo>>4); + Z.hi = (Z.hi>>4); + if (sizeof(size_t)==8) + Z.hi ^= rem_4bit[rem]; + else + Z.hi ^= (u64)rem_4bit[rem]<<32; + + Z.hi ^= Htable[nlo].hi; + Z.lo ^= Htable[nlo].lo; + } + + if (is_endian.little) { +#ifdef BSWAP8 + Xi[0] = BSWAP8(Z.hi); + Xi[1] = BSWAP8(Z.lo); +#else + u8 *p = (u8 *)Xi; + u32 v; + v = (u32)(Z.hi>>32); PUTU32(p,v); + v = (u32)(Z.hi); PUTU32(p+4,v); + v = (u32)(Z.lo>>32); PUTU32(p+8,v); + v = (u32)(Z.lo); PUTU32(p+12,v); +#endif + } + else { + Xi[0] = Z.hi; + Xi[1] = Z.lo; + } +} + +#if !defined(OPENSSL_SMALL_FOOTPRINT) +/* + * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for + * details... Compiler-generated code doesn't seem to give any + * performance improvement, at least not on x86[_64]. It's here + * mostly as reference and a placeholder for possible future + * non-trivial optimization[s]... + */ +static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16], + const u8 *inp,size_t len) +{ + u128 Z; + int cnt; + size_t rem, nlo, nhi; + const union { long one; char little; } is_endian = {1}; + +#if 1 + do { + cnt = 15; + nlo = ((const u8 *)Xi)[15]; + nlo ^= inp[15]; + nhi = nlo>>4; + nlo &= 0xf; + + Z.hi = Htable[nlo].hi; + Z.lo = Htable[nlo].lo; + + while (1) { + rem = (size_t)Z.lo&0xf; + Z.lo = (Z.hi<<60)|(Z.lo>>4); + Z.hi = (Z.hi>>4); + if (sizeof(size_t)==8) + Z.hi ^= rem_4bit[rem]; + else + Z.hi ^= (u64)rem_4bit[rem]<<32; + + Z.hi ^= Htable[nhi].hi; + Z.lo ^= Htable[nhi].lo; + + if (--cnt<0) break; + + nlo = ((const u8 *)Xi)[cnt]; + nlo ^= inp[cnt]; + nhi = nlo>>4; + nlo &= 0xf; + + rem = (size_t)Z.lo&0xf; + Z.lo = (Z.hi<<60)|(Z.lo>>4); + Z.hi = (Z.hi>>4); + if (sizeof(size_t)==8) + Z.hi ^= rem_4bit[rem]; + else + Z.hi ^= (u64)rem_4bit[rem]<<32; + + Z.hi ^= Htable[nlo].hi; + Z.lo ^= Htable[nlo].lo; + } +#else + /* + * Extra 256+16 bytes per-key plus 512 bytes shared tables + * [should] give ~50% improvement... One could have PACK()-ed + * the rem_8bit even here, but the priority is to minimize + * cache footprint... + */ + u128 Hshr4[16]; /* Htable shifted right by 4 bits */ + u8 Hshl4[16]; /* Htable shifted left by 4 bits */ + static const unsigned short rem_8bit[256] = { + 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E, + 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E, + 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E, + 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E, + 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E, + 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E, + 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E, + 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E, + 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE, + 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE, + 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE, + 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE, + 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E, + 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E, + 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE, + 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE, + 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E, + 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E, + 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E, + 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E, + 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E, + 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E, + 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E, + 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E, + 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE, + 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE, + 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE, + 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE, + 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E, + 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E, + 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE, + 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE }; + /* + * This pre-processing phase slows down procedure by approximately + * same time as it makes each loop spin faster. In other words + * single block performance is approximately same as straightforward + * "4-bit" implementation, and then it goes only faster... + */ + for (cnt=0; cnt<16; ++cnt) { + Z.hi = Htable[cnt].hi; + Z.lo = Htable[cnt].lo; + Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4); + Hshr4[cnt].hi = (Z.hi>>4); + Hshl4[cnt] = (u8)(Z.lo<<4); + } + + do { + for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) { + nlo = ((const u8 *)Xi)[cnt]; + nlo ^= inp[cnt]; + nhi = nlo>>4; + nlo &= 0xf; + + Z.hi ^= Htable[nlo].hi; + Z.lo ^= Htable[nlo].lo; + + rem = (size_t)Z.lo&0xff; + + Z.lo = (Z.hi<<56)|(Z.lo>>8); + Z.hi = (Z.hi>>8); + + Z.hi ^= Hshr4[nhi].hi; + Z.lo ^= Hshr4[nhi].lo; + Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48; + } + + nlo = ((const u8 *)Xi)[0]; + nlo ^= inp[0]; + nhi = nlo>>4; + nlo &= 0xf; + + Z.hi ^= Htable[nlo].hi; + Z.lo ^= Htable[nlo].lo; + + rem = (size_t)Z.lo&0xf; + + Z.lo = (Z.hi<<60)|(Z.lo>>4); + Z.hi = (Z.hi>>4); + + Z.hi ^= Htable[nhi].hi; + Z.lo ^= Htable[nhi].lo; + Z.hi ^= ((u64)rem_8bit[rem<<4])<<48; +#endif + + if (is_endian.little) { +#ifdef BSWAP8 + Xi[0] = BSWAP8(Z.hi); + Xi[1] = BSWAP8(Z.lo); +#else + u8 *p = (u8 *)Xi; + u32 v; + v = (u32)(Z.hi>>32); PUTU32(p,v); + v = (u32)(Z.hi); PUTU32(p+4,v); + v = (u32)(Z.lo>>32); PUTU32(p+8,v); + v = (u32)(Z.lo); PUTU32(p+12,v); +#endif + } + else { + Xi[0] = Z.hi; + Xi[1] = Z.lo; + } + } while (inp+=16, len-=16); +} +#endif +#else +void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +#endif + +#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable) +#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT) +#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len) +/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache + * trashing effect. In other words idea is to hash data while it's + * still in L1 cache after encryption pass... */ +#define GHASH_CHUNK (3*1024) +#endif + +#else /* TABLE_BITS */ + +static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2]) +{ + u128 V,Z = { 0,0 }; + long X; + int i,j; + const long *xi = (const long *)Xi; + const union { long one; char little; } is_endian = {1}; + + V.hi = H[0]; /* H is in host byte order, no byte swapping */ + V.lo = H[1]; + + for (j=0; j<16/sizeof(long); ++j) { + if (is_endian.little) { + if (sizeof(long)==8) { +#ifdef BSWAP8 + X = (long)(BSWAP8(xi[j])); +#else + const u8 *p = (const u8 *)(xi+j); + X = (long)((u64)GETU32(p)<<32|GETU32(p+4)); +#endif + } + else { + const u8 *p = (const u8 *)(xi+j); + X = (long)GETU32(p); + } + } + else + X = xi[j]; + + for (i=0; i<8*sizeof(long); ++i, X<<=1) { + u64 M = (u64)(X>>(8*sizeof(long)-1)); + Z.hi ^= V.hi&M; + Z.lo ^= V.lo&M; + + REDUCE1BIT(V); + } + } + + if (is_endian.little) { +#ifdef BSWAP8 + Xi[0] = BSWAP8(Z.hi); + Xi[1] = BSWAP8(Z.lo); +#else + u8 *p = (u8 *)Xi; + u32 v; + v = (u32)(Z.hi>>32); PUTU32(p,v); + v = (u32)(Z.hi); PUTU32(p+4,v); + v = (u32)(Z.lo>>32); PUTU32(p+8,v); + v = (u32)(Z.lo); PUTU32(p+12,v); +#endif + } + else { + Xi[0] = Z.hi; + Xi[1] = Z.lo; + } +} +#define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u) + +#endif + +#if TABLE_BITS==4 && defined(GHASH_ASM) +# if !defined(I386_ONLY) && \ + (defined(__i386) || defined(__i386__) || \ + defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64)) +# define GHASH_ASM_X86_OR_64 +# define GCM_FUNCREF_4BIT +extern unsigned int OPENSSL_ia32cap_P[2]; + +void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]); +void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); + +# if defined(__i386) || defined(__i386__) || defined(_M_IX86) +# define GHASH_ASM_X86 +void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); + +void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +# endif +# elif defined(__arm__) || defined(__arm) +# include "arm_arch.h" +# if __ARM_ARCH__>=7 +# define GHASH_ASM_ARM +# define GCM_FUNCREF_4BIT +void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]); +void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +# endif +# endif +#endif + +#ifdef GCM_FUNCREF_4BIT +# undef GCM_MUL +# define GCM_MUL(ctx,Xi) (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable) +# ifdef GHASH +# undef GHASH +# define GHASH(ctx,in,len) (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len) +# endif +#endif + +void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block) +{ + const union { long one; char little; } is_endian = {1}; + + memset(ctx,0,sizeof(*ctx)); + ctx->block = block; + ctx->key = key; + + (*block)(ctx->H.c,ctx->H.c,key); + + if (is_endian.little) { + /* H is stored in host byte order */ +#ifdef BSWAP8 + ctx->H.u[0] = BSWAP8(ctx->H.u[0]); + ctx->H.u[1] = BSWAP8(ctx->H.u[1]); +#else + u8 *p = ctx->H.c; + u64 hi,lo; + hi = (u64)GETU32(p) <<32|GETU32(p+4); + lo = (u64)GETU32(p+8)<<32|GETU32(p+12); + ctx->H.u[0] = hi; + ctx->H.u[1] = lo; +#endif + } + +#if TABLE_BITS==8 + gcm_init_8bit(ctx->Htable,ctx->H.u); +#elif TABLE_BITS==4 +# if defined(GHASH_ASM_X86_OR_64) +# if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2) + if (OPENSSL_ia32cap_P[0]&(1<<24) && /* check FXSR bit */ + OPENSSL_ia32cap_P[1]&(1<<1) ) { /* check PCLMULQDQ bit */ + gcm_init_clmul(ctx->Htable,ctx->H.u); + ctx->gmult = gcm_gmult_clmul; + ctx->ghash = gcm_ghash_clmul; + return; + } +# endif + gcm_init_4bit(ctx->Htable,ctx->H.u); +# if defined(GHASH_ASM_X86) /* x86 only */ +# if defined(OPENSSL_IA32_SSE2) + if (OPENSSL_ia32cap_P[0]&(1<<25)) { /* check SSE bit */ +# else + if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* check MMX bit */ +# endif + ctx->gmult = gcm_gmult_4bit_mmx; + ctx->ghash = gcm_ghash_4bit_mmx; + } else { + ctx->gmult = gcm_gmult_4bit_x86; + ctx->ghash = gcm_ghash_4bit_x86; + } +# else + ctx->gmult = gcm_gmult_4bit; + ctx->ghash = gcm_ghash_4bit; +# endif +# elif defined(GHASH_ASM_ARM) + if (OPENSSL_armcap_P & ARMV7_NEON) { + ctx->gmult = gcm_gmult_neon; + ctx->ghash = gcm_ghash_neon; + } else { + gcm_init_4bit(ctx->Htable,ctx->H.u); + ctx->gmult = gcm_gmult_4bit; + ctx->ghash = gcm_ghash_4bit; + } +# else + gcm_init_4bit(ctx->Htable,ctx->H.u); +# endif +#endif +} + +void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len) +{ + const union { long one; char little; } is_endian = {1}; + unsigned int ctr; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; +#endif + + ctx->Yi.u[0] = 0; + ctx->Yi.u[1] = 0; + ctx->Xi.u[0] = 0; + ctx->Xi.u[1] = 0; + ctx->len.u[0] = 0; /* AAD length */ + ctx->len.u[1] = 0; /* message length */ + ctx->ares = 0; + ctx->mres = 0; + + if (len==12) { + memcpy(ctx->Yi.c,iv,12); + ctx->Yi.c[15]=1; + ctr=1; + } + else { + size_t i; + u64 len0 = len; + + while (len>=16) { + for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i]; + GCM_MUL(ctx,Yi); + iv += 16; + len -= 16; + } + if (len) { + for (i=0; iYi.c[i] ^= iv[i]; + GCM_MUL(ctx,Yi); + } + len0 <<= 3; + if (is_endian.little) { +#ifdef BSWAP8 + ctx->Yi.u[1] ^= BSWAP8(len0); +#else + ctx->Yi.c[8] ^= (u8)(len0>>56); + ctx->Yi.c[9] ^= (u8)(len0>>48); + ctx->Yi.c[10] ^= (u8)(len0>>40); + ctx->Yi.c[11] ^= (u8)(len0>>32); + ctx->Yi.c[12] ^= (u8)(len0>>24); + ctx->Yi.c[13] ^= (u8)(len0>>16); + ctx->Yi.c[14] ^= (u8)(len0>>8); + ctx->Yi.c[15] ^= (u8)(len0); +#endif + } + else + ctx->Yi.u[1] ^= len0; + + GCM_MUL(ctx,Yi); + + if (is_endian.little) + ctr = GETU32(ctx->Yi.c+12); + else + ctr = ctx->Yi.d[3]; + } + + (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; +} + +int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len) +{ + size_t i; + unsigned int n; + u64 alen = ctx->len.u[0]; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; +# ifdef GHASH + void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], + const u8 *inp,size_t len) = ctx->ghash; +# endif +#endif + + if (ctx->len.u[1]) return -2; + + alen += len; + if (alen>(U64(1)<<61) || (sizeof(len)==8 && alenlen.u[0] = alen; + + n = ctx->ares; + if (n) { + while (n && len) { + ctx->Xi.c[n] ^= *(aad++); + --len; + n = (n+1)%16; + } + if (n==0) GCM_MUL(ctx,Xi); + else { + ctx->ares = n; + return 0; + } + } + +#ifdef GHASH + if ((i = (len&(size_t)-16))) { + GHASH(ctx,aad,i); + aad += i; + len -= i; + } +#else + while (len>=16) { + for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i]; + GCM_MUL(ctx,Xi); + aad += 16; + len -= 16; + } +#endif + if (len) { + n = (unsigned int)len; + for (i=0; iXi.c[i] ^= aad[i]; + } + + ctx->ares = n; + return 0; +} + +int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len) +{ + const union { long one; char little; } is_endian = {1}; + unsigned int n, ctr; + size_t i; + u64 mlen = ctx->len.u[1]; + block128_f block = ctx->block; + void *key = ctx->key; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; +# ifdef GHASH + void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], + const u8 *inp,size_t len) = ctx->ghash; +# endif +#endif + +#if 0 + n = (unsigned int)mlen%16; /* alternative to ctx->mres */ +#endif + mlen += len; + if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlenlen.u[1] = mlen; + + if (ctx->ares) { + /* First call to encrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx,Xi); + ctx->ares = 0; + } + + if (is_endian.little) + ctr = GETU32(ctx->Yi.c+12); + else + ctr = ctx->Yi.d[3]; + + n = ctx->mres; +#if !defined(OPENSSL_SMALL_FOOTPRINT) + if (16%sizeof(size_t) == 0) do { /* always true actually */ + if (n) { + while (n && len) { + ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n]; + --len; + n = (n+1)%16; + } + if (n==0) GCM_MUL(ctx,Xi); + else { + ctx->mres = n; + return 0; + } + } +#if defined(STRICT_ALIGNMENT) + if (((size_t)in|(size_t)out)%sizeof(size_t) != 0) + break; +#endif +#if defined(GHASH) && defined(GHASH_CHUNK) + while (len>=GHASH_CHUNK) { + size_t j=GHASH_CHUNK; + + while (j) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + for (i=0; i<16; i+=sizeof(size_t)) + *(size_t *)(out+i) = + *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i); + out += 16; + in += 16; + j -= 16; + } + GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK); + len -= GHASH_CHUNK; + } + if ((i = (len&(size_t)-16))) { + size_t j=i; + + while (len>=16) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + for (i=0; i<16; i+=sizeof(size_t)) + *(size_t *)(out+i) = + *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i); + out += 16; + in += 16; + len -= 16; + } + GHASH(ctx,out-j,j); + } +#else + while (len>=16) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + for (i=0; i<16; i+=sizeof(size_t)) + *(size_t *)(ctx->Xi.c+i) ^= + *(size_t *)(out+i) = + *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i); + GCM_MUL(ctx,Xi); + out += 16; + in += 16; + len -= 16; + } +#endif + if (len) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + while (len--) { + ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 0; + } while(0); +#endif + for (i=0;iYi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + } + ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n]; + n = (n+1)%16; + if (n==0) + GCM_MUL(ctx,Xi); + } + + ctx->mres = n; + return 0; +} + +int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len) +{ + const union { long one; char little; } is_endian = {1}; + unsigned int n, ctr; + size_t i; + u64 mlen = ctx->len.u[1]; + block128_f block = ctx->block; + void *key = ctx->key; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; +# ifdef GHASH + void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], + const u8 *inp,size_t len) = ctx->ghash; +# endif +#endif + + mlen += len; + if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlenlen.u[1] = mlen; + + if (ctx->ares) { + /* First call to decrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx,Xi); + ctx->ares = 0; + } + + if (is_endian.little) + ctr = GETU32(ctx->Yi.c+12); + else + ctr = ctx->Yi.d[3]; + + n = ctx->mres; +#if !defined(OPENSSL_SMALL_FOOTPRINT) + if (16%sizeof(size_t) == 0) do { /* always true actually */ + if (n) { + while (n && len) { + u8 c = *(in++); + *(out++) = c^ctx->EKi.c[n]; + ctx->Xi.c[n] ^= c; + --len; + n = (n+1)%16; + } + if (n==0) GCM_MUL (ctx,Xi); + else { + ctx->mres = n; + return 0; + } + } +#if defined(STRICT_ALIGNMENT) + if (((size_t)in|(size_t)out)%sizeof(size_t) != 0) + break; +#endif +#if defined(GHASH) && defined(GHASH_CHUNK) + while (len>=GHASH_CHUNK) { + size_t j=GHASH_CHUNK; + + GHASH(ctx,in,GHASH_CHUNK); + while (j) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + for (i=0; i<16; i+=sizeof(size_t)) + *(size_t *)(out+i) = + *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i); + out += 16; + in += 16; + j -= 16; + } + len -= GHASH_CHUNK; + } + if ((i = (len&(size_t)-16))) { + GHASH(ctx,in,i); + while (len>=16) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + for (i=0; i<16; i+=sizeof(size_t)) + *(size_t *)(out+i) = + *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i); + out += 16; + in += 16; + len -= 16; + } + } +#else + while (len>=16) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + for (i=0; i<16; i+=sizeof(size_t)) { + size_t c = *(size_t *)(in+i); + *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i); + *(size_t *)(ctx->Xi.c+i) ^= c; + } + GCM_MUL(ctx,Xi); + out += 16; + in += 16; + len -= 16; + } +#endif + if (len) { + (*block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + while (len--) { + u8 c = in[n]; + ctx->Xi.c[n] ^= c; + out[n] = c^ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 0; + } while(0); +#endif + for (i=0;iYi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + } + c = in[i]; + out[i] = c^ctx->EKi.c[n]; + ctx->Xi.c[n] ^= c; + n = (n+1)%16; + if (n==0) + GCM_MUL(ctx,Xi); + } + + ctx->mres = n; + return 0; +} + +int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len, ctr128_f stream) +{ + const union { long one; char little; } is_endian = {1}; + unsigned int n, ctr; + size_t i; + u64 mlen = ctx->len.u[1]; + void *key = ctx->key; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; +# ifdef GHASH + void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], + const u8 *inp,size_t len) = ctx->ghash; +# endif +#endif + + mlen += len; + if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlenlen.u[1] = mlen; + + if (ctx->ares) { + /* First call to encrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx,Xi); + ctx->ares = 0; + } + + if (is_endian.little) + ctr = GETU32(ctx->Yi.c+12); + else + ctr = ctx->Yi.d[3]; + + n = ctx->mres; + if (n) { + while (n && len) { + ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n]; + --len; + n = (n+1)%16; + } + if (n==0) GCM_MUL(ctx,Xi); + else { + ctx->mres = n; + return 0; + } + } +#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT) + while (len>=GHASH_CHUNK) { + (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c); + ctr += GHASH_CHUNK/16; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + GHASH(ctx,out,GHASH_CHUNK); + out += GHASH_CHUNK; + in += GHASH_CHUNK; + len -= GHASH_CHUNK; + } +#endif + if ((i = (len&(size_t)-16))) { + size_t j=i/16; + + (*stream)(in,out,j,key,ctx->Yi.c); + ctr += (unsigned int)j; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + in += i; + len -= i; +#if defined(GHASH) + GHASH(ctx,out,i); + out += i; +#else + while (j--) { + for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i]; + GCM_MUL(ctx,Xi); + out += 16; + } +#endif + } + if (len) { + (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + while (len--) { + ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 0; +} + +int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len,ctr128_f stream) +{ + const union { long one; char little; } is_endian = {1}; + unsigned int n, ctr; + size_t i; + u64 mlen = ctx->len.u[1]; + void *key = ctx->key; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; +# ifdef GHASH + void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16], + const u8 *inp,size_t len) = ctx->ghash; +# endif +#endif + + mlen += len; + if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlenlen.u[1] = mlen; + + if (ctx->ares) { + /* First call to decrypt finalizes GHASH(AAD) */ + GCM_MUL(ctx,Xi); + ctx->ares = 0; + } + + if (is_endian.little) + ctr = GETU32(ctx->Yi.c+12); + else + ctr = ctx->Yi.d[3]; + + n = ctx->mres; + if (n) { + while (n && len) { + u8 c = *(in++); + *(out++) = c^ctx->EKi.c[n]; + ctx->Xi.c[n] ^= c; + --len; + n = (n+1)%16; + } + if (n==0) GCM_MUL (ctx,Xi); + else { + ctx->mres = n; + return 0; + } + } +#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT) + while (len>=GHASH_CHUNK) { + GHASH(ctx,in,GHASH_CHUNK); + (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c); + ctr += GHASH_CHUNK/16; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + out += GHASH_CHUNK; + in += GHASH_CHUNK; + len -= GHASH_CHUNK; + } +#endif + if ((i = (len&(size_t)-16))) { + size_t j=i/16; + +#if defined(GHASH) + GHASH(ctx,in,i); +#else + while (j--) { + size_t k; + for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k]; + GCM_MUL(ctx,Xi); + in += 16; + } + j = i/16; + in -= i; +#endif + (*stream)(in,out,j,key,ctx->Yi.c); + ctr += (unsigned int)j; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + out += i; + in += i; + len -= i; + } + if (len) { + (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key); + ++ctr; + if (is_endian.little) + PUTU32(ctx->Yi.c+12,ctr); + else + ctx->Yi.d[3] = ctr; + while (len--) { + u8 c = in[n]; + ctx->Xi.c[n] ^= c; + out[n] = c^ctx->EKi.c[n]; + ++n; + } + } + + ctx->mres = n; + return 0; +} + +int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag, + size_t len) +{ + const union { long one; char little; } is_endian = {1}; + u64 alen = ctx->len.u[0]<<3; + u64 clen = ctx->len.u[1]<<3; +#ifdef GCM_FUNCREF_4BIT + void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult; +#endif + + if (ctx->mres) + GCM_MUL(ctx,Xi); + + if (is_endian.little) { +#ifdef BSWAP8 + alen = BSWAP8(alen); + clen = BSWAP8(clen); +#else + u8 *p = ctx->len.c; + + ctx->len.u[0] = alen; + ctx->len.u[1] = clen; + + alen = (u64)GETU32(p) <<32|GETU32(p+4); + clen = (u64)GETU32(p+8)<<32|GETU32(p+12); +#endif + } + + ctx->Xi.u[0] ^= alen; + ctx->Xi.u[1] ^= clen; + GCM_MUL(ctx,Xi); + + ctx->Xi.u[0] ^= ctx->EK0.u[0]; + ctx->Xi.u[1] ^= ctx->EK0.u[1]; + + if (tag && len<=sizeof(ctx->Xi)) + return memcmp(ctx->Xi.c,tag,len); + else + return -1; +} + +void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len) +{ + CRYPTO_gcm128_finish(ctx, NULL, 0); + memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c)); +} + +GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block) +{ + GCM128_CONTEXT *ret; + + if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT)))) + CRYPTO_gcm128_init(ret,key,block); + + return ret; +} + +void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx) +{ + if (ctx) { + OPENSSL_cleanse(ctx,sizeof(*ctx)); + OPENSSL_free(ctx); + } +} + +#if defined(SELFTEST) +#include +#include + +/* Test Case 1 */ +static const u8 K1[16], + *P1=NULL, + *A1=NULL, + IV1[12], + *C1=NULL, + T1[]= {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a}; + +/* Test Case 2 */ +#define K2 K1 +#define A2 A1 +#define IV2 IV1 +static const u8 P2[16], + C2[]= {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78}, + T2[]= {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf}; + +/* Test Case 3 */ +#define A3 A2 +static const u8 K3[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08}, + P3[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, + 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, + 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, + 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55}, + IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88}, + C3[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c, + 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e, + 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05, + 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85}, + T3[]= {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4}; + +/* Test Case 4 */ +#define K4 K3 +#define IV4 IV3 +static const u8 P4[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, + 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, + 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, + 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39}, + A4[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef, + 0xab,0xad,0xda,0xd2}, + C4[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c, + 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e, + 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05, + 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91}, + T4[]= {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47}; + +/* Test Case 5 */ +#define K5 K4 +#define P5 P4 +#define A5 A4 +static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad}, + C5[]= {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55, + 0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23, + 0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42, + 0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98}, + T5[]= {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb}; + +/* Test Case 6 */ +#define K6 K5 +#define P6 P5 +#define A6 A5 +static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa, + 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28, + 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54, + 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b}, + C6[]= {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94, + 0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7, + 0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f, + 0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5}, + T6[]= {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50}; + +/* Test Case 7 */ +static const u8 K7[24], + *P7=NULL, + *A7=NULL, + IV7[12], + *C7=NULL, + T7[]= {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35}; + +/* Test Case 8 */ +#define K8 K7 +#define IV8 IV7 +#define A8 A7 +static const u8 P8[16], + C8[]= {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00}, + T8[]= {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb}; + +/* Test Case 9 */ +#define A9 A8 +static const u8 K9[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08, + 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c}, + P9[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, + 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, + 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, + 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55}, + IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88}, + C9[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57, + 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c, + 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47, + 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56}, + T9[]= {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14}; + +/* Test Case 10 */ +#define K10 K9 +#define IV10 IV9 +static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, + 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, + 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, + 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39}, + A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef, + 0xab,0xad,0xda,0xd2}, + C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57, + 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c, + 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47, + 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10}, + T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c}; + +/* Test Case 11 */ +#define K11 K10 +#define P11 P10 +#define A11 A10 +static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad}, + C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8, + 0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57, + 0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9, + 0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7}, + T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8}; + +/* Test Case 12 */ +#define K12 K11 +#define P12 P11 +#define A12 A11 +static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa, + 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28, + 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54, + 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b}, + C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff, + 0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45, + 0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3, + 0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b}, + T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9}; + +/* Test Case 13 */ +static const u8 K13[32], + *P13=NULL, + *A13=NULL, + IV13[12], + *C13=NULL, + T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b}; + +/* Test Case 14 */ +#define K14 K13 +#define A14 A13 +static const u8 P14[16], + IV14[12], + C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18}, + T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19}; + +/* Test Case 15 */ +#define A15 A14 +static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08, + 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08}, + P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, + 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, + 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, + 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55}, + IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88}, + C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d, + 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa, + 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38, + 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad}, + T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c}; + +/* Test Case 16 */ +#define K16 K15 +#define IV16 IV15 +static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a, + 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72, + 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25, + 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39}, + A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef, + 0xab,0xad,0xda,0xd2}, + C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d, + 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa, + 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38, + 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62}, + T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b}; + +/* Test Case 17 */ +#define K17 K16 +#define P17 P16 +#define A17 A16 +static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad}, + C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb, + 0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0, + 0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78, + 0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f}, + T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2}; + +/* Test Case 18 */ +#define K18 K17 +#define P18 P17 +#define A18 A17 +static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa, + 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28, + 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54, + 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b}, + C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20, + 0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4, + 0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde, + 0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f}, + T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a}; + +#define TEST_CASE(n) do { \ + u8 out[sizeof(P##n)]; \ + AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \ + CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \ + CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \ + memset(out,0,sizeof(out)); \ + if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \ + if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \ + if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \ + (C##n && memcmp(out,C##n,sizeof(out)))) \ + ret++, printf ("encrypt test#%d failed.\n",n); \ + CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \ + memset(out,0,sizeof(out)); \ + if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \ + if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \ + if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \ + (P##n && memcmp(out,P##n,sizeof(out)))) \ + ret++, printf ("decrypt test#%d failed.\n",n); \ + } while(0) + +int main() +{ + GCM128_CONTEXT ctx; + AES_KEY key; + int ret=0; + + TEST_CASE(1); + TEST_CASE(2); + TEST_CASE(3); + TEST_CASE(4); + TEST_CASE(5); + TEST_CASE(6); + TEST_CASE(7); + TEST_CASE(8); + TEST_CASE(9); + TEST_CASE(10); + TEST_CASE(11); + TEST_CASE(12); + TEST_CASE(13); + TEST_CASE(14); + TEST_CASE(15); + TEST_CASE(16); + TEST_CASE(17); + TEST_CASE(18); + +#ifdef OPENSSL_CPUID_OBJ + { + size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc(); + union { u64 u; u8 c[1024]; } buf; + int i; + + AES_set_encrypt_key(K1,sizeof(K1)*8,&key); + CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); + CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1)); + + CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf)); + start = OPENSSL_rdtsc(); + CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf)); + gcm_t = OPENSSL_rdtsc() - start; + + CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf), + &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres, + (block128_f)AES_encrypt); + start = OPENSSL_rdtsc(); + CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf), + &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres, + (block128_f)AES_encrypt); + ctr_t = OPENSSL_rdtsc() - start; + + printf("%.2f-%.2f=%.2f\n", + gcm_t/(double)sizeof(buf), + ctr_t/(double)sizeof(buf), + (gcm_t-ctr_t)/(double)sizeof(buf)); +#ifdef GHASH + GHASH(&ctx,buf.c,sizeof(buf)); + start = OPENSSL_rdtsc(); + for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf)); + gcm_t = OPENSSL_rdtsc() - start; + printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i); +#endif + } +#endif + + return ret; +} +#endif diff --git a/src/lib/libcrypto/modes/modes.h b/src/lib/libcrypto/modes/modes.h index af8d97d795..f18215bb2b 100644 --- a/src/lib/libcrypto/modes/modes.h +++ b/src/lib/libcrypto/modes/modes.h @@ -15,6 +15,14 @@ typedef void (*cbc128_f)(const unsigned char *in, unsigned char *out, size_t len, const void *key, unsigned char ivec[16], int enc); +typedef void (*ctr128_f)(const unsigned char *in, unsigned char *out, + size_t blocks, const void *key, + const unsigned char ivec[16]); + +typedef void (*ccm128_f)(const unsigned char *in, unsigned char *out, + size_t blocks, const void *key, + const unsigned char ivec[16],unsigned char cmac[16]); + void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out, size_t len, const void *key, unsigned char ivec[16], block128_f block); @@ -27,6 +35,11 @@ void CRYPTO_ctr128_encrypt(const unsigned char *in, unsigned char *out, unsigned char ivec[16], unsigned char ecount_buf[16], unsigned int *num, block128_f block); +void CRYPTO_ctr128_encrypt_ctr32(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], unsigned char ecount_buf[16], + unsigned int *num, ctr128_f ctr); + void CRYPTO_ofb128_encrypt(const unsigned char *in, unsigned char *out, size_t len, const void *key, unsigned char ivec[16], int *num, @@ -57,3 +70,66 @@ size_t CRYPTO_cts128_decrypt_block(const unsigned char *in, unsigned char *out, size_t CRYPTO_cts128_decrypt(const unsigned char *in, unsigned char *out, size_t len, const void *key, unsigned char ivec[16], cbc128_f cbc); + +size_t CRYPTO_nistcts128_encrypt_block(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], block128_f block); +size_t CRYPTO_nistcts128_encrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], cbc128_f cbc); +size_t CRYPTO_nistcts128_decrypt_block(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], block128_f block); +size_t CRYPTO_nistcts128_decrypt(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + unsigned char ivec[16], cbc128_f cbc); + +typedef struct gcm128_context GCM128_CONTEXT; + +GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block); +void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block); +void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv, + size_t len); +int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad, + size_t len); +int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len); +int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len); +int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len, ctr128_f stream); +int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, + const unsigned char *in, unsigned char *out, + size_t len, ctr128_f stream); +int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag, + size_t len); +void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len); +void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx); + +typedef struct ccm128_context CCM128_CONTEXT; + +void CRYPTO_ccm128_init(CCM128_CONTEXT *ctx, + unsigned int M, unsigned int L, void *key,block128_f block); +int CRYPTO_ccm128_setiv(CCM128_CONTEXT *ctx, + const unsigned char *nonce, size_t nlen, size_t mlen); +void CRYPTO_ccm128_aad(CCM128_CONTEXT *ctx, + const unsigned char *aad, size_t alen); +int CRYPTO_ccm128_encrypt(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, size_t len); +int CRYPTO_ccm128_decrypt(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, size_t len); +int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, size_t len, + ccm128_f stream); +int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx, + const unsigned char *inp, unsigned char *out, size_t len, + ccm128_f stream); +size_t CRYPTO_ccm128_tag(CCM128_CONTEXT *ctx, unsigned char *tag, size_t len); + +typedef struct xts128_context XTS128_CONTEXT; + +int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16], + const unsigned char *inp, unsigned char *out, size_t len, int enc); diff --git a/src/lib/libcrypto/modes/modes_lcl.h b/src/lib/libcrypto/modes/modes_lcl.h new file mode 100644 index 0000000000..b6dc3c336f --- /dev/null +++ b/src/lib/libcrypto/modes/modes_lcl.h @@ -0,0 +1,131 @@ +/* ==================================================================== + * Copyright (c) 2010 The OpenSSL Project. All rights reserved. + * + * Redistribution and use is governed by OpenSSL license. + * ==================================================================== + */ + +#include + + +#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__) +typedef __int64 i64; +typedef unsigned __int64 u64; +#define U64(C) C##UI64 +#elif defined(__arch64__) +typedef long i64; +typedef unsigned long u64; +#define U64(C) C##UL +#else +typedef long long i64; +typedef unsigned long long u64; +#define U64(C) C##ULL +#endif + +typedef unsigned int u32; +typedef unsigned char u8; + +#define STRICT_ALIGNMENT 1 +#if defined(__i386) || defined(__i386__) || \ + defined(__x86_64) || defined(__x86_64__) || \ + defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ + defined(__s390__) || defined(__s390x__) || \ + ( (defined(__arm__) || defined(__arm)) && \ + (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || \ + defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)) ) +# undef STRICT_ALIGNMENT +#endif + +#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) +#if defined(__GNUC__) && __GNUC__>=2 +# if defined(__x86_64) || defined(__x86_64__) +# define BSWAP8(x) ({ u64 ret=(x); \ + asm ("bswapq %0" \ + : "+r"(ret)); ret; }) +# define BSWAP4(x) ({ u32 ret=(x); \ + asm ("bswapl %0" \ + : "+r"(ret)); ret; }) +# elif (defined(__i386) || defined(__i386__)) && !defined(I386_ONLY) +# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \ + asm ("bswapl %0; bswapl %1" \ + : "+r"(hi),"+r"(lo)); \ + (u64)hi<<32|lo; }) +# define BSWAP4(x) ({ u32 ret=(x); \ + asm ("bswapl %0" \ + : "+r"(ret)); ret; }) +# elif (defined(__arm__) || defined(__arm)) && !defined(STRICT_ALIGNMENT) +# define BSWAP8(x) ({ u32 lo=(u64)(x)>>32,hi=(x); \ + asm ("rev %0,%0; rev %1,%1" \ + : "+r"(hi),"+r"(lo)); \ + (u64)hi<<32|lo; }) +# define BSWAP4(x) ({ u32 ret; \ + asm ("rev %0,%1" \ + : "=r"(ret) : "r"((u32)(x))); \ + ret; }) +# endif +#elif defined(_MSC_VER) +# if _MSC_VER>=1300 +# pragma intrinsic(_byteswap_uint64,_byteswap_ulong) +# define BSWAP8(x) _byteswap_uint64((u64)(x)) +# define BSWAP4(x) _byteswap_ulong((u32)(x)) +# elif defined(_M_IX86) + __inline u32 _bswap4(u32 val) { + _asm mov eax,val + _asm bswap eax + } +# define BSWAP4(x) _bswap4(x) +# endif +#endif +#endif + +#if defined(BSWAP4) && !defined(STRICT_ALIGNMENT) +#define GETU32(p) BSWAP4(*(const u32 *)(p)) +#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v) +#else +#define GETU32(p) ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3]) +#define PUTU32(p,v) ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v)) +#endif + +/* GCM definitions */ + +typedef struct { u64 hi,lo; } u128; + +#ifdef TABLE_BITS +#undef TABLE_BITS +#endif +/* + * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should + * never be set to 8 [or 1]. For further information see gcm128.c. + */ +#define TABLE_BITS 4 + +struct gcm128_context { + /* Following 6 names follow names in GCM specification */ + union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,len, + Xi,H; + /* Relative position of Xi, H and pre-computed Htable is used + * in some assembler modules, i.e. don't change the order! */ +#if TABLE_BITS==8 + u128 Htable[256]; +#else + u128 Htable[16]; + void (*gmult)(u64 Xi[2],const u128 Htable[16]); + void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len); +#endif + unsigned int mres, ares; + block128_f block; + void *key; +}; + +struct xts128_context { + void *key1, *key2; + block128_f block1,block2; +}; + +struct ccm128_context { + union { u64 u[2]; u8 c[16]; } nonce, cmac; + u64 blocks; + block128_f block; + void *key; +}; + diff --git a/src/lib/libcrypto/modes/ofb128.c b/src/lib/libcrypto/modes/ofb128.c index c732e2ec58..01c01702c4 100644 --- a/src/lib/libcrypto/modes/ofb128.c +++ b/src/lib/libcrypto/modes/ofb128.c @@ -48,7 +48,8 @@ * */ -#include "modes.h" +#include +#include "modes_lcl.h" #include #ifndef MODES_DEBUG @@ -58,14 +59,6 @@ #endif #include -#define STRICT_ALIGNMENT -#if defined(__i386) || defined(__i386__) || \ - defined(__x86_64) || defined(__x86_64__) || \ - defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64) || \ - defined(__s390__) || defined(__s390x__) -# undef STRICT_ALIGNMENT -#endif - /* The input and output encrypted as though 128bit ofb mode is being * used. The extra state information to record how much of the * 128bit block we have used is contained in *num; diff --git a/src/lib/libcrypto/modes/xts128.c b/src/lib/libcrypto/modes/xts128.c new file mode 100644 index 0000000000..9cf27a25e9 --- /dev/null +++ b/src/lib/libcrypto/modes/xts128.c @@ -0,0 +1,187 @@ +/* ==================================================================== + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +#include +#include "modes_lcl.h" +#include + +#ifndef MODES_DEBUG +# ifndef NDEBUG +# define NDEBUG +# endif +#endif +#include + +int CRYPTO_xts128_encrypt(const XTS128_CONTEXT *ctx, const unsigned char iv[16], + const unsigned char *inp, unsigned char *out, + size_t len, int enc) +{ + const union { long one; char little; } is_endian = {1}; + union { u64 u[2]; u32 d[4]; u8 c[16]; } tweak, scratch; + unsigned int i; + + if (len<16) return -1; + + memcpy(tweak.c, iv, 16); + + (*ctx->block2)(tweak.c,tweak.c,ctx->key2); + + if (!enc && (len%16)) len-=16; + + while (len>=16) { +#if defined(STRICT_ALIGNMENT) + memcpy(scratch.c,inp,16); + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; +#else + scratch.u[0] = ((u64*)inp)[0]^tweak.u[0]; + scratch.u[1] = ((u64*)inp)[1]^tweak.u[1]; +#endif + (*ctx->block1)(scratch.c,scratch.c,ctx->key1); +#if defined(STRICT_ALIGNMENT) + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out,scratch.c,16); +#else + ((u64*)out)[0] = scratch.u[0]^=tweak.u[0]; + ((u64*)out)[1] = scratch.u[1]^=tweak.u[1]; +#endif + inp += 16; + out += 16; + len -= 16; + + if (len==0) return 0; + + if (is_endian.little) { + unsigned int carry,res; + + res = 0x87&(((int)tweak.d[3])>>31); + carry = (unsigned int)(tweak.u[0]>>63); + tweak.u[0] = (tweak.u[0]<<1)^res; + tweak.u[1] = (tweak.u[1]<<1)|carry; + } + else { + size_t c; + + for (c=0,i=0;i<16;++i) { + /*+ substitutes for |, because c is 1 bit */ + c += ((size_t)tweak.c[i])<<1; + tweak.c[i] = (u8)c; + c = c>>8; + } + tweak.c[0] ^= (u8)(0x87&(0-c)); + } + } + if (enc) { + for (i=0;iblock1)(scratch.c,scratch.c,ctx->key1); + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy(out-16,scratch.c,16); + } + else { + union { u64 u[2]; u8 c[16]; } tweak1; + + if (is_endian.little) { + unsigned int carry,res; + + res = 0x87&(((int)tweak.d[3])>>31); + carry = (unsigned int)(tweak.u[0]>>63); + tweak1.u[0] = (tweak.u[0]<<1)^res; + tweak1.u[1] = (tweak.u[1]<<1)|carry; + } + else { + size_t c; + + for (c=0,i=0;i<16;++i) { + /*+ substitutes for |, because c is 1 bit */ + c += ((size_t)tweak.c[i])<<1; + tweak1.c[i] = (u8)c; + c = c>>8; + } + tweak1.c[0] ^= (u8)(0x87&(0-c)); + } +#if defined(STRICT_ALIGNMENT) + memcpy(scratch.c,inp,16); + scratch.u[0] ^= tweak1.u[0]; + scratch.u[1] ^= tweak1.u[1]; +#else + scratch.u[0] = ((u64*)inp)[0]^tweak1.u[0]; + scratch.u[1] = ((u64*)inp)[1]^tweak1.u[1]; +#endif + (*ctx->block1)(scratch.c,scratch.c,ctx->key1); + scratch.u[0] ^= tweak1.u[0]; + scratch.u[1] ^= tweak1.u[1]; + + for (i=0;iblock1)(scratch.c,scratch.c,ctx->key1); +#if defined(STRICT_ALIGNMENT) + scratch.u[0] ^= tweak.u[0]; + scratch.u[1] ^= tweak.u[1]; + memcpy (out,scratch.c,16); +#else + ((u64*)out)[0] = scratch.u[0]^tweak.u[0]; + ((u64*)out)[1] = scratch.u[1]^tweak.u[1]; +#endif + } + + return 0; +} diff --git a/src/lib/libcrypto/o_init.c b/src/lib/libcrypto/o_init.c index 00ed65a6cf..db4cdc443b 100644 --- a/src/lib/libcrypto/o_init.c +++ b/src/lib/libcrypto/o_init.c @@ -3,7 +3,7 @@ * project. */ /* ==================================================================== - * Copyright (c) 2007 The OpenSSL Project. All rights reserved. + * Copyright (c) 2011 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -50,14 +50,14 @@ * OF THE POSSIBILITY OF SUCH DAMAGE. * ==================================================================== * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * */ #include #include +#ifdef OPENSSL_FIPS +#include +#include +#endif /* Perform any essential OpenSSL initialization operations. * Currently only sets FIPS callbacks @@ -65,22 +65,18 @@ void OPENSSL_init(void) { -#ifdef OPENSSL_FIPS static int done = 0; - if (!done) - { - int_ERR_lib_init(); -#ifdef CRYPTO_MDEBUG - CRYPTO_malloc_debug_init(); -#endif -#ifdef OPENSSL_ENGINE - int_EVP_MD_init_engine_callbacks(); - int_EVP_CIPHER_init_engine_callbacks(); - int_RAND_init_engine_callbacks(); + if (done) + return; + done = 1; +#ifdef OPENSSL_FIPS + FIPS_set_locking_callbacks(CRYPTO_lock, CRYPTO_add_lock); + FIPS_set_error_callbacks(ERR_put_error, ERR_add_error_vdata); + FIPS_set_malloc_callbacks(CRYPTO_malloc, CRYPTO_free); + RAND_init_fips(); #endif - done = 1; - } +#if 0 + fprintf(stderr, "Called OPENSSL_init\n"); #endif } - diff --git a/src/lib/libcrypto/objects/obj_mac.num b/src/lib/libcrypto/objects/obj_mac.num index 8c50aac27f..1d0a7c802d 100644 --- a/src/lib/libcrypto/objects/obj_mac.num +++ b/src/lib/libcrypto/objects/obj_mac.num @@ -890,3 +890,30 @@ houseIdentifier 889 supportedAlgorithms 890 deltaRevocationList 891 dmdName 892 +id_alg_PWRI_KEK 893 +cmac 894 +aes_128_gcm 895 +aes_128_ccm 896 +id_aes128_wrap_pad 897 +aes_192_gcm 898 +aes_192_ccm 899 +id_aes192_wrap_pad 900 +aes_256_gcm 901 +aes_256_ccm 902 +id_aes256_wrap_pad 903 +aes_128_ctr 904 +aes_192_ctr 905 +aes_256_ctr 906 +id_camellia128_wrap 907 +id_camellia192_wrap 908 +id_camellia256_wrap 909 +anyExtendedKeyUsage 910 +mgf1 911 +rsassaPss 912 +aes_128_xts 913 +aes_256_xts 914 +rc4_hmac_md5 915 +aes_128_cbc_hmac_sha1 916 +aes_192_cbc_hmac_sha1 917 +aes_256_cbc_hmac_sha1 918 +rsaesOaep 919 diff --git a/src/lib/libcrypto/objects/obj_xref.c b/src/lib/libcrypto/objects/obj_xref.c index 152eca5c67..9f744bcede 100644 --- a/src/lib/libcrypto/objects/obj_xref.c +++ b/src/lib/libcrypto/objects/obj_xref.c @@ -110,8 +110,10 @@ int OBJ_find_sigid_algs(int signid, int *pdig_nid, int *ppkey_nid) #endif if (rv == NULL) return 0; - *pdig_nid = rv->hash_id; - *ppkey_nid = rv->pkey_id; + if (pdig_nid) + *pdig_nid = rv->hash_id; + if (ppkey_nid) + *ppkey_nid = rv->pkey_id; return 1; } @@ -144,7 +146,8 @@ int OBJ_find_sigid_by_algs(int *psignid, int dig_nid, int pkey_nid) #endif if (rv == NULL) return 0; - *psignid = (*rv)->sign_id; + if (psignid) + *psignid = (*rv)->sign_id; return 1; } diff --git a/src/lib/libcrypto/objects/obj_xref.h b/src/lib/libcrypto/objects/obj_xref.h index d5b9b8e198..e23938c296 100644 --- a/src/lib/libcrypto/objects/obj_xref.h +++ b/src/lib/libcrypto/objects/obj_xref.h @@ -38,10 +38,12 @@ static const nid_triple sigoid_srt[] = {NID_id_GostR3411_94_with_GostR3410_94, NID_id_GostR3411_94, NID_id_GostR3410_94}, {NID_id_GostR3411_94_with_GostR3410_94_cc, NID_id_GostR3411_94, NID_id_GostR3410_94_cc}, {NID_id_GostR3411_94_with_GostR3410_2001_cc, NID_id_GostR3411_94, NID_id_GostR3410_2001_cc}, + {NID_rsassaPss, NID_undef, NID_rsaEncryption}, }; static const nid_triple * const sigoid_srt_xref[] = { + &sigoid_srt[29], &sigoid_srt[17], &sigoid_srt[18], &sigoid_srt[0], diff --git a/src/lib/libcrypto/objects/obj_xref.txt b/src/lib/libcrypto/objects/obj_xref.txt index e45b3d34b9..cb917182ee 100644 --- a/src/lib/libcrypto/objects/obj_xref.txt +++ b/src/lib/libcrypto/objects/obj_xref.txt @@ -13,6 +13,10 @@ sha512WithRSAEncryption sha512 rsaEncryption sha224WithRSAEncryption sha224 rsaEncryption mdc2WithRSA mdc2 rsaEncryption ripemd160WithRSA ripemd160 rsaEncryption +# For PSS the digest algorithm can vary and depends on the included +# AlgorithmIdentifier. The digest "undef" indicates the public key +# method should handle this explicitly. +rsassaPss undef rsaEncryption # Alternative deprecated OIDs. By using the older "rsa" OID this # type will be recognized by not normally used. diff --git a/src/lib/libcrypto/objects/objects.txt b/src/lib/libcrypto/objects/objects.txt index e61fe60cbf..d3bfad72a2 100644 --- a/src/lib/libcrypto/objects/objects.txt +++ b/src/lib/libcrypto/objects/objects.txt @@ -166,6 +166,10 @@ pkcs1 3 : RSA-MD4 : md4WithRSAEncryption pkcs1 4 : RSA-MD5 : md5WithRSAEncryption pkcs1 5 : RSA-SHA1 : sha1WithRSAEncryption # According to PKCS #1 version 2.1 +pkcs1 7 : RSAES-OAEP : rsaesOaep +pkcs1 8 : MGF1 : mgf1 +pkcs1 10 : RSASSA-PSS : rsassaPss + pkcs1 11 : RSA-SHA256 : sha256WithRSAEncryption pkcs1 12 : RSA-SHA384 : sha384WithRSAEncryption pkcs1 13 : RSA-SHA512 : sha512WithRSAEncryption @@ -299,6 +303,7 @@ id-smime-alg 4 : id-smime-alg-RC2wrap id-smime-alg 5 : id-smime-alg-ESDH id-smime-alg 6 : id-smime-alg-CMS3DESwrap id-smime-alg 7 : id-smime-alg-CMSRC2wrap +id-smime-alg 9 : id-alg-PWRI-KEK # S/MIME Certificate Distribution id-smime-cd 1 : id-smime-cd-ldap @@ -770,6 +775,10 @@ id-ce 55 : targetInformation : X509v3 AC Targeting !Cname no-rev-avail id-ce 56 : noRevAvail : X509v3 No Revocation Available +# From RFC5280 +ext-key-usage 0 : anyExtendedKeyUsage : Any Extended Key Usage + + !Cname netscape 2 16 840 1 113730 : Netscape : Netscape Communications Corp. !Cname netscape-cert-extension @@ -846,6 +855,10 @@ aes 2 : AES-128-CBC : aes-128-cbc aes 3 : AES-128-OFB : aes-128-ofb !Cname aes-128-cfb128 aes 4 : AES-128-CFB : aes-128-cfb +aes 5 : id-aes128-wrap +aes 6 : id-aes128-GCM : aes-128-gcm +aes 7 : id-aes128-CCM : aes-128-ccm +aes 8 : id-aes128-wrap-pad aes 21 : AES-192-ECB : aes-192-ecb aes 22 : AES-192-CBC : aes-192-cbc @@ -853,6 +866,10 @@ aes 22 : AES-192-CBC : aes-192-cbc aes 23 : AES-192-OFB : aes-192-ofb !Cname aes-192-cfb128 aes 24 : AES-192-CFB : aes-192-cfb +aes 25 : id-aes192-wrap +aes 26 : id-aes192-GCM : aes-192-gcm +aes 27 : id-aes192-CCM : aes-192-ccm +aes 28 : id-aes192-wrap-pad aes 41 : AES-256-ECB : aes-256-ecb aes 42 : AES-256-CBC : aes-256-cbc @@ -860,6 +877,10 @@ aes 42 : AES-256-CBC : aes-256-cbc aes 43 : AES-256-OFB : aes-256-ofb !Cname aes-256-cfb128 aes 44 : AES-256-CFB : aes-256-cfb +aes 45 : id-aes256-wrap +aes 46 : id-aes256-GCM : aes-256-gcm +aes 47 : id-aes256-CCM : aes-256-ccm +aes 48 : id-aes256-wrap-pad # There are no OIDs for these modes... @@ -869,15 +890,16 @@ aes 44 : AES-256-CFB : aes-256-cfb : AES-128-CFB8 : aes-128-cfb8 : AES-192-CFB8 : aes-192-cfb8 : AES-256-CFB8 : aes-256-cfb8 + : AES-128-CTR : aes-128-ctr + : AES-192-CTR : aes-192-ctr + : AES-256-CTR : aes-256-ctr + : AES-128-XTS : aes-128-xts + : AES-256-XTS : aes-256-xts : DES-CFB1 : des-cfb1 : DES-CFB8 : des-cfb8 : DES-EDE3-CFB1 : des-ede3-cfb1 : DES-EDE3-CFB8 : des-ede3-cfb8 -aes 5 : id-aes128-wrap -aes 25 : id-aes192-wrap -aes 45 : id-aes256-wrap - # OIDs for SHA224, SHA256, SHA385 and SHA512, according to x9.84. !Alias nist_hashalgs nistAlgorithms 2 nist_hashalgs 1 : SHA256 : sha256 @@ -1211,6 +1233,9 @@ cryptocom 1 8 1 : id-GostR3410-2001-ParamSet-cc : GOST R 3410-2001 Parameter Se 1 2 392 200011 61 1 1 1 2 : CAMELLIA-128-CBC : camellia-128-cbc 1 2 392 200011 61 1 1 1 3 : CAMELLIA-192-CBC : camellia-192-cbc 1 2 392 200011 61 1 1 1 4 : CAMELLIA-256-CBC : camellia-256-cbc +1 2 392 200011 61 1 1 3 2 : id-camellia128-wrap +1 2 392 200011 61 1 1 3 3 : id-camellia192-wrap +1 2 392 200011 61 1 1 3 4 : id-camellia256-wrap # Definitions for Camellia cipher - ECB, CFB, OFB MODE @@ -1257,3 +1282,11 @@ kisa 1 6 : SEED-OFB : seed-ofb # There is no OID that just denotes "HMAC" oddly enough... : HMAC : hmac +# Nor CMAC either + : CMAC : cmac + +# Synthetic composite ciphersuites + : RC4-HMAC-MD5 : rc4-hmac-md5 + : AES-128-CBC-HMAC-SHA1 : aes-128-cbc-hmac-sha1 + : AES-192-CBC-HMAC-SHA1 : aes-192-cbc-hmac-sha1 + : AES-256-CBC-HMAC-SHA1 : aes-256-cbc-hmac-sha1 diff --git a/src/lib/libcrypto/ocsp/ocsp_lib.c b/src/lib/libcrypto/ocsp/ocsp_lib.c index e92b86c060..a94dc838ee 100644 --- a/src/lib/libcrypto/ocsp/ocsp_lib.c +++ b/src/lib/libcrypto/ocsp/ocsp_lib.c @@ -124,7 +124,8 @@ OCSP_CERTID *OCSP_cert_id_new(const EVP_MD *dgst, if (!(ASN1_OCTET_STRING_set(cid->issuerNameHash, md, i))) goto err; /* Calculate the issuerKey hash, excluding tag and length */ - EVP_Digest(issuerKey->data, issuerKey->length, md, &i, dgst, NULL); + if (!EVP_Digest(issuerKey->data, issuerKey->length, md, &i, dgst, NULL)) + goto err; if (!(ASN1_OCTET_STRING_set(cid->issuerKeyHash, md, i))) goto err; diff --git a/src/lib/libcrypto/opensslv.h b/src/lib/libcrypto/opensslv.h index d6d61a0c7d..71be3590af 100644 --- a/src/lib/libcrypto/opensslv.h +++ b/src/lib/libcrypto/opensslv.h @@ -25,11 +25,11 @@ * (Prior to 0.9.5a beta1, a different scheme was used: MMNNFFRBB for * major minor fix final patch/beta) */ -#define OPENSSL_VERSION_NUMBER 0x1000006fL +#define OPENSSL_VERSION_NUMBER 0x1000103fL #ifdef OPENSSL_FIPS -#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.0f-fips 4 Jan 2012" +#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.1c-fips 10 May 2012" #else -#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.0f 4 Jan 2012" +#define OPENSSL_VERSION_TEXT "OpenSSL 1.0.1c 10 May 2012" #endif #define OPENSSL_VERSION_PTEXT " part of " OPENSSL_VERSION_TEXT diff --git a/src/lib/libcrypto/ossl_typ.h b/src/lib/libcrypto/ossl_typ.h index 12bd7014de..ea9227f6f9 100644 --- a/src/lib/libcrypto/ossl_typ.h +++ b/src/lib/libcrypto/ossl_typ.h @@ -91,10 +91,12 @@ typedef struct asn1_string_st ASN1_TIME; typedef struct asn1_string_st ASN1_GENERALIZEDTIME; typedef struct asn1_string_st ASN1_VISIBLESTRING; typedef struct asn1_string_st ASN1_UTF8STRING; +typedef struct asn1_string_st ASN1_STRING; typedef int ASN1_BOOLEAN; typedef int ASN1_NULL; #endif +typedef struct ASN1_ITEM_st ASN1_ITEM; typedef struct asn1_pctx_st ASN1_PCTX; #ifdef OPENSSL_SYS_WIN32 diff --git a/src/lib/libcrypto/pariscid.pl b/src/lib/libcrypto/pariscid.pl new file mode 100644 index 0000000000..477ec9b87d --- /dev/null +++ b/src/lib/libcrypto/pariscid.pl @@ -0,0 +1,224 @@ +#!/usr/bin/env perl + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $ST ="std"; +} else { + $LEVEL ="1.1"; + $SIZE_T =4; + $ST ="stw"; +} + +$rp="%r2"; +$sp="%r30"; +$rv="%r28"; + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT OPENSSL_cpuid_setup,ENTRY + .ALIGN 8 +OPENSSL_cpuid_setup + .PROC + .CALLINFO NO_CALLS + .ENTRY + bv ($rp) + .EXIT + nop + .PROCEND + + .EXPORT OPENSSL_rdtsc,ENTRY + .ALIGN 8 +OPENSSL_rdtsc + .PROC + .CALLINFO NO_CALLS + .ENTRY + mfctl %cr16,$rv + bv ($rp) + .EXIT + nop + .PROCEND + + .EXPORT OPENSSL_wipe_cpu,ENTRY + .ALIGN 8 +OPENSSL_wipe_cpu + .PROC + .CALLINFO NO_CALLS + .ENTRY + xor %r0,%r0,%r1 + fcpy,dbl %fr0,%fr4 + xor %r0,%r0,%r19 + fcpy,dbl %fr0,%fr5 + xor %r0,%r0,%r20 + fcpy,dbl %fr0,%fr6 + xor %r0,%r0,%r21 + fcpy,dbl %fr0,%fr7 + xor %r0,%r0,%r22 + fcpy,dbl %fr0,%fr8 + xor %r0,%r0,%r23 + fcpy,dbl %fr0,%fr9 + xor %r0,%r0,%r24 + fcpy,dbl %fr0,%fr10 + xor %r0,%r0,%r25 + fcpy,dbl %fr0,%fr11 + xor %r0,%r0,%r26 + fcpy,dbl %fr0,%fr22 + xor %r0,%r0,%r29 + fcpy,dbl %fr0,%fr23 + xor %r0,%r0,%r31 + fcpy,dbl %fr0,%fr24 + fcpy,dbl %fr0,%fr25 + fcpy,dbl %fr0,%fr26 + fcpy,dbl %fr0,%fr27 + fcpy,dbl %fr0,%fr28 + fcpy,dbl %fr0,%fr29 + fcpy,dbl %fr0,%fr30 + fcpy,dbl %fr0,%fr31 + bv ($rp) + .EXIT + ldo 0($sp),$rv + .PROCEND +___ +{ +my $inp="%r26"; +my $len="%r25"; + +$code.=<<___; + .EXPORT OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR + .ALIGN 8 +OPENSSL_cleanse + .PROC + .CALLINFO NO_CALLS + .ENTRY + cmpib,*= 0,$len,Ldone + nop + cmpib,*>>= 15,$len,Little + ldi $SIZE_T-1,%r1 + +Lalign + and,*<> $inp,%r1,%r28 + b,n Laligned + stb %r0,0($inp) + ldo -1($len),$len + b Lalign + ldo 1($inp),$inp + +Laligned + andcm $len,%r1,%r28 +Lot + $ST %r0,0($inp) + addib,*<> -$SIZE_T,%r28,Lot + ldo $SIZE_T($inp),$inp + + and,*<> $len,%r1,$len + b,n Ldone +Little + stb %r0,0($inp) + addib,*<> -1,$len,Little + ldo 1($inp),$inp +Ldone + bv ($rp) + .EXIT + nop + .PROCEND +___ +} +{ +my ($out,$cnt,$max)=("%r26","%r25","%r24"); +my ($tick,$lasttick)=("%r23","%r22"); +my ($diff,$lastdiff)=("%r21","%r20"); + +$code.=<<___; + .EXPORT OPENSSL_instrument_bus,ENTRY,ARGW0=GR,ARGW1=GR + .ALIGN 8 +OPENSSL_instrument_bus + .PROC + .CALLINFO NO_CALLS + .ENTRY + copy $cnt,$rv + mfctl %cr16,$tick + copy $tick,$lasttick + ldi 0,$diff + + fdc 0($out) + ldw 0($out),$tick + add $diff,$tick,$tick + stw $tick,0($out) +Loop + mfctl %cr16,$tick + sub $tick,$lasttick,$diff + copy $tick,$lasttick + + fdc 0($out) + ldw 0($out),$tick + add $diff,$tick,$tick + stw $tick,0($out) + + addib,<> -1,$cnt,Loop + addi 4,$out,$out + + bv ($rp) + .EXIT + sub $rv,$cnt,$rv + .PROCEND + + .EXPORT OPENSSL_instrument_bus2,ENTRY,ARGW0=GR,ARGW1=GR + .ALIGN 8 +OPENSSL_instrument_bus2 + .PROC + .CALLINFO NO_CALLS + .ENTRY + copy $cnt,$rv + sub %r0,$cnt,$cnt + + mfctl %cr16,$tick + copy $tick,$lasttick + ldi 0,$diff + + fdc 0($out) + ldw 0($out),$tick + add $diff,$tick,$tick + stw $tick,0($out) + + mfctl %cr16,$tick + sub $tick,$lasttick,$diff + copy $tick,$lasttick +Loop2 + copy $diff,$lastdiff + fdc 0($out) + ldw 0($out),$tick + add $diff,$tick,$tick + stw $tick,0($out) + + addib,= -1,$max,Ldone2 + nop + + mfctl %cr16,$tick + sub $tick,$lasttick,$diff + copy $tick,$lasttick + cmpclr,<> $lastdiff,$diff,$tick + ldi 1,$tick + + ldi 1,%r1 + xor %r1,$tick,$tick + addb,<> $tick,$cnt,Loop2 + shladd,l $tick,2,$out,$out +Ldone2 + bv ($rp) + .EXIT + add $rv,$cnt,$rv + .PROCEND +___ +} +$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); +$code =~ s/,\*/,/gm if ($SIZE_T==4); +print $code; +close STDOUT; + diff --git a/src/lib/libcrypto/pem/pvkfmt.c b/src/lib/libcrypto/pem/pvkfmt.c index 5f130c4528..b1bf71a5da 100644 --- a/src/lib/libcrypto/pem/pvkfmt.c +++ b/src/lib/libcrypto/pem/pvkfmt.c @@ -709,13 +709,16 @@ static int derive_pvk_key(unsigned char *key, const unsigned char *pass, int passlen) { EVP_MD_CTX mctx; + int rv = 1; EVP_MD_CTX_init(&mctx); - EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL); - EVP_DigestUpdate(&mctx, salt, saltlen); - EVP_DigestUpdate(&mctx, pass, passlen); - EVP_DigestFinal_ex(&mctx, key, NULL); + if (!EVP_DigestInit_ex(&mctx, EVP_sha1(), NULL) + || !EVP_DigestUpdate(&mctx, salt, saltlen) + || !EVP_DigestUpdate(&mctx, pass, passlen) + || !EVP_DigestFinal_ex(&mctx, key, NULL)) + rv = 0; + EVP_MD_CTX_cleanup(&mctx); - return 1; + return rv; } @@ -727,11 +730,12 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in, const unsigned char *p = *in; unsigned int magic; unsigned char *enctmp = NULL, *q; + EVP_CIPHER_CTX cctx; + EVP_CIPHER_CTX_init(&cctx); if (saltlen) { char psbuf[PEM_BUFSIZE]; unsigned char keybuf[20]; - EVP_CIPHER_CTX cctx; int enctmplen, inlen; if (cb) inlen=cb(psbuf,PEM_BUFSIZE,0,u); @@ -757,37 +761,41 @@ static EVP_PKEY *do_PVK_body(const unsigned char **in, p += 8; inlen = keylen - 8; q = enctmp + 8; - EVP_CIPHER_CTX_init(&cctx); - EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL); - EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen); - EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen); + if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL)) + goto err; + if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen)) + goto err; + if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen, &enctmplen)) + goto err; magic = read_ledword((const unsigned char **)&q); if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) { q = enctmp + 8; memset(keybuf + 5, 0, 11); - EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, - NULL); + if (!EVP_DecryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, + NULL)) + goto err; OPENSSL_cleanse(keybuf, 20); - EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen); - EVP_DecryptFinal_ex(&cctx, q + enctmplen, - &enctmplen); + if (!EVP_DecryptUpdate(&cctx, q, &enctmplen, p, inlen)) + goto err; + if (!EVP_DecryptFinal_ex(&cctx, q + enctmplen, + &enctmplen)) + goto err; magic = read_ledword((const unsigned char **)&q); if (magic != MS_RSA2MAGIC && magic != MS_DSS2MAGIC) { - EVP_CIPHER_CTX_cleanup(&cctx); PEMerr(PEM_F_DO_PVK_BODY, PEM_R_BAD_DECRYPT); goto err; } } else OPENSSL_cleanse(keybuf, 20); - EVP_CIPHER_CTX_cleanup(&cctx); p = enctmp; } ret = b2i_PrivateKey(&p, keylen); err: + EVP_CIPHER_CTX_cleanup(&cctx); if (enctmp && saltlen) OPENSSL_free(enctmp); return ret; @@ -841,6 +849,8 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel, { int outlen = 24, pklen; unsigned char *p, *salt = NULL; + EVP_CIPHER_CTX cctx; + EVP_CIPHER_CTX_init(&cctx); if (enclevel) outlen += PVK_SALTLEN; pklen = do_i2b(NULL, pk, 0); @@ -885,7 +895,6 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel, { char psbuf[PEM_BUFSIZE]; unsigned char keybuf[20]; - EVP_CIPHER_CTX cctx; int enctmplen, inlen; if (cb) inlen=cb(psbuf,PEM_BUFSIZE,1,u); @@ -902,16 +911,19 @@ static int i2b_PVK(unsigned char **out, EVP_PKEY*pk, int enclevel, if (enclevel == 1) memset(keybuf + 5, 0, 11); p = salt + PVK_SALTLEN + 8; - EVP_CIPHER_CTX_init(&cctx); - EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL); + if (!EVP_EncryptInit_ex(&cctx, EVP_rc4(), NULL, keybuf, NULL)) + goto error; OPENSSL_cleanse(keybuf, 20); - EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8); - EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen); - EVP_CIPHER_CTX_cleanup(&cctx); + if (!EVP_DecryptUpdate(&cctx, p, &enctmplen, p, pklen - 8)) + goto error; + if (!EVP_DecryptFinal_ex(&cctx, p + enctmplen, &enctmplen)) + goto error; } + EVP_CIPHER_CTX_cleanup(&cctx); return outlen; error: + EVP_CIPHER_CTX_cleanup(&cctx); return -1; } diff --git a/src/lib/libcrypto/perlasm/ppc-xlate.pl b/src/lib/libcrypto/perlasm/ppc-xlate.pl index 4579671c97..a3edd982b6 100755 --- a/src/lib/libcrypto/perlasm/ppc-xlate.pl +++ b/src/lib/libcrypto/perlasm/ppc-xlate.pl @@ -31,10 +31,9 @@ my $globl = sub { $ret .= ".type $name,\@function"; last; }; - /linux.*64/ && do { $ret .= ".globl .$name\n"; - $ret .= ".type .$name,\@function\n"; + /linux.*64/ && do { $ret .= ".globl $name\n"; + $ret .= ".type $name,\@function\n"; $ret .= ".section \".opd\",\"aw\"\n"; - $ret .= ".globl $name\n"; $ret .= ".align 3\n"; $ret .= "$name:\n"; $ret .= ".quad .$name,.TOC.\@tocbase,0\n"; @@ -62,6 +61,14 @@ my $machine = sub { } ".machine $arch"; }; +my $size = sub { + if ($flavour =~ /linux.*32/) + { shift; + ".size " . join(",",@_); + } + else + { ""; } +}; my $asciz = sub { shift; my $line = join(",",@_); diff --git a/src/lib/libcrypto/perlasm/x86_64-xlate.pl b/src/lib/libcrypto/perlasm/x86_64-xlate.pl index e47116b74b..56d9b64b6f 100755 --- a/src/lib/libcrypto/perlasm/x86_64-xlate.pl +++ b/src/lib/libcrypto/perlasm/x86_64-xlate.pl @@ -62,12 +62,8 @@ my $flavour = shift; my $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } -{ my ($stddev,$stdino,@junk)=stat(STDOUT); - my ($outdev,$outino,@junk)=stat($output); - - open STDOUT,">$output" || die "can't open $output: $!" - if ($stddev!=$outdev || $stdino!=$outino); -} +open STDOUT,">$output" || die "can't open $output: $!" + if (defined($output)); my $gas=1; $gas=0 if ($output =~ /\.asm$/); my $elf=1; $elf=0 if (!$gas); @@ -116,12 +112,16 @@ my %globals; $line = substr($line,@+[0]); $line =~ s/^\s+//; undef $self->{sz}; - if ($self->{op} =~ /^(movz)b.*/) { # movz is pain... + if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain... $self->{op} = $1; - $self->{sz} = "b"; + $self->{sz} = $2; } elsif ($self->{op} =~ /call|jmp/) { $self->{sz} = ""; - } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op)/) { # SSEn + } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn + $self->{sz} = ""; + } elsif ($self->{op} =~ /^v/) { # VEX + $self->{sz} = ""; + } elsif ($self->{op} =~ /movq/ && $line =~ /%xmm/) { $self->{sz} = ""; } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) { $self->{op} = $1; @@ -246,35 +246,39 @@ my %globals; $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; $self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{label}, new gas requires sign extension... + use integer; + $self->{label} =~ s/(?{label} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg; + $self->{label} =~ s/([0-9]+)/$1<<32>>32/eg; + if ($gas) { - # Solaris /usr/ccs/bin/as can't handle multiplications - # in $self->{label}, new gas requires sign extension... - use integer; - $self->{label} =~ s/(?{label} =~ s/([0-9]+\s*[\*\/\%]\s*[0-9]+)/eval($1)/eg; - $self->{label} =~ s/([0-9]+)/$1<<32>>32/eg; $self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64"); if (defined($self->{index})) { - sprintf "%s%s(%%%s,%%%s,%d)",$self->{asterisk}, - $self->{label},$self->{base}, + sprintf "%s%s(%s,%%%s,%d)",$self->{asterisk}, + $self->{label}, + $self->{base}?"%$self->{base}":"", $self->{index},$self->{scale}; } else { sprintf "%s%s(%%%s)", $self->{asterisk},$self->{label},$self->{base}; } } else { - %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR", q=>"QWORD$PTR" ); + %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", l=>"DWORD$PTR", + q=>"QWORD$PTR",o=>"OWORD$PTR",x=>"XMMWORD$PTR" ); $self->{label} =~ s/\./\$/g; $self->{label} =~ s/(?{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/); - $sz="q" if ($self->{asterisk}); + $sz="q" if ($self->{asterisk} || opcode->mnemonic() eq "movq"); + $sz="l" if (opcode->mnemonic() eq "movd"); if (defined($self->{index})) { - sprintf "%s[%s%s*%d+%s]",$szmap{$sz}, + sprintf "%s[%s%s*%d%s]",$szmap{$sz}, $self->{label}?"$self->{label}+":"", $self->{index},$self->{scale}, - $self->{base}; + $self->{base}?"+$self->{base}":""; } elsif ($self->{base} eq "rip") { sprintf "%s[%s]",$szmap{$sz},$self->{label}; } else { @@ -506,6 +510,12 @@ my %globals; } } elsif ($dir =~ /\.(text|data)/) { $current_segment=".$1"; + } elsif ($dir =~ /\.hidden/) { + if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$line"; } + elsif ($flavour eq "mingw64") { $self->{value} = ""; } + } elsif ($dir =~ /\.comm/) { + $self->{value} = "$dir\t$prefix$line"; + $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx"); } $line = ""; return $self; @@ -555,7 +565,8 @@ my %globals; $v.=" READONLY"; $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref); } elsif ($line=~/\.CRT\$/i) { - $v.=" READONLY DWORD"; + $v.=" READONLY "; + $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD"; } } $current_segment = $line; @@ -577,7 +588,7 @@ my %globals; $self->{value}="${decor}SEH_end_$current_function->{name}:"; $self->{value}.=":\n" if($masm); } - $self->{value}.="$current_function->{name}\tENDP" if($masm); + $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name}); undef $current_function; } last; @@ -613,6 +624,19 @@ my %globals; .join(",",@str) if (@str); last; }; + /\.comm/ && do { my @str=split(/,\s*/,$line); + my $v=undef; + if ($nasm) { + $v.="common $prefix@str[0] @str[1]"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT\n"; + $v.="COMM @str[0]:DWORD:".@str[1]/4; + } + $self->{value} = $v; + last; + }; } $line = ""; } @@ -625,9 +649,133 @@ my %globals; } } +sub rex { + local *opcode=shift; + my ($dst,$src,$rex)=@_; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @opcode,($rex|0x40) if ($rex); +} + +# older gas and ml64 don't handle SSE>2 instructions +my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3, + "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 ); + +my $movq = sub { # elderly gas can't handle inter-register movq + my $arg = shift; + my @opcode=(0x66); + if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) { + my ($src,$dst)=($1,$2); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x7e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) { + my ($src,$dst)=($2,$1); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x6e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } else { + (); + } +}; + +my $pextrd = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) { + my @opcode=(0x66); + $imm=$1; + $src=$2; + $dst=$3; + if ($dst =~ /%r([0-9]+)d/) { $dst = $1; } + elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; } + rex(\@opcode,$src,$dst); + push @opcode,0x0f,0x3a,0x16; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pinsrd = sub { + if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + $imm=$1; + $src=$2; + $dst=$3; + if ($src =~ /%r([0-9]+)/) { $src = $1; } + elsif ($src =~ /%e/) { $src = $regrm{$src}; } + rex(\@opcode,$dst,$src); + push @opcode,0x0f,0x3a,0x22; + push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pshufb = sub { + if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$2,$1); + push @opcode,0x0f,0x38,0x00; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + @opcode; + } else { + (); + } +}; + +my $palignr = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x0f; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + push @opcode,$1; + @opcode; + } else { + (); + } +}; + +my $pclmulqdq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x44; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $rdrand = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$1,8); + push @opcode,0x0f,0xc7,0xf0|($dst&7); + @opcode; + } else { + (); + } +}; + if ($nasm) { print <<___; default rel +%define XMMWORD ___ } elsif ($masm) { print <<___; @@ -644,14 +792,22 @@ while($line=<>) { undef $label; undef $opcode; - undef $sz; undef @args; if ($label=label->re(\$line)) { print $label->out(); } if (directive->re(\$line)) { printf "%s",directive->out(); - } elsif ($opcode=opcode->re(\$line)) { ARGUMENT: while (1) { + } elsif ($opcode=opcode->re(\$line)) { + my $asm = eval("\$".$opcode->mnemonic()); + undef @bytes; + + if ((ref($asm) eq 'CODE') && scalar(@bytes=&$asm($line))) { + print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; + next; + } + + ARGUMENT: while (1) { my $arg; if ($arg=register->re(\$line)) { opcode->size($arg->size()); } @@ -667,19 +823,26 @@ while($line=<>) { $line =~ s/^,\s*//; } # ARGUMENT: - $sz=opcode->size(); - if ($#args>=0) { my $insn; + my $sz=opcode->size(); + if ($gas) { $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz); + @args = map($_->out($sz),@args); + printf "\t%s\t%s",$insn,join(",",@args); } else { $insn = $opcode->out(); - $insn .= $sz if (map($_->out() =~ /x?mm/,@args)); + foreach (@args) { + my $arg = $_->out(); + # $insn.=$sz compensates for movq, pinsrw, ... + if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; } + if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; } + } @args = reverse(@args); undef $sz if ($nasm && $opcode->mnemonic() eq "lea"); + printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args)); } - printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args)); } else { printf "\t%s",$opcode->out(); } diff --git a/src/lib/libcrypto/perlasm/x86asm.pl b/src/lib/libcrypto/perlasm/x86asm.pl index 28080caaa6..eb543db2f6 100644 --- a/src/lib/libcrypto/perlasm/x86asm.pl +++ b/src/lib/libcrypto/perlasm/x86asm.pl @@ -80,6 +80,57 @@ sub ::movq { &::generic("movq",@_); } } +# SSE>2 instructions +my %regrm = ( "eax"=>0, "ecx"=>1, "edx"=>2, "ebx"=>3, + "esp"=>4, "ebp"=>5, "esi"=>6, "edi"=>7 ); +sub ::pextrd +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /(e[a-dsd][ixp]):xmm([0-7])/) + { &::data_byte(0x66,0x0f,0x3a,0x16,0xc0|($2<<3)|$regrm{$1},$imm); } + else + { &::generic("pextrd",@_); } +} + +sub ::pinsrd +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /xmm([0-7]):(e[a-dsd][ixp])/) + { &::data_byte(0x66,0x0f,0x3a,0x22,0xc0|($1<<3)|$regrm{$2},$imm); } + else + { &::generic("pinsrd",@_); } +} + +sub ::pshufb +{ my($dst,$src)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &data_byte(0x66,0x0f,0x38,0x00,0xc0|($1<<3)|$2); } + else + { &::generic("pshufb",@_); } +} + +sub ::palignr +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &::data_byte(0x66,0x0f,0x3a,0x0f,0xc0|($1<<3)|$2,$imm); } + else + { &::generic("palignr",@_); } +} + +sub ::pclmulqdq +{ my($dst,$src,$imm)=@_; + if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) + { &::data_byte(0x66,0x0f,0x3a,0x44,0xc0|($1<<3)|$2,$imm); } + else + { &::generic("pclmulqdq",@_); } +} + +sub ::rdrand +{ my ($dst)=@_; + if ($dst =~ /(e[a-dsd][ixp])/) + { &::data_byte(0x0f,0xc7,0xf0|$regrm{$dst}); } + else + { &::generic("rdrand",@_); } +} + # label management $lbdecor="L"; # local label decoration, set by package $label="000"; @@ -167,7 +218,7 @@ sub ::asm_init $filename=$fn; $i386=$cpu; - $elf=$cpp=$coff=$aout=$macosx=$win32=$netware=$mwerks=0; + $elf=$cpp=$coff=$aout=$macosx=$win32=$netware=$mwerks=$android=0; if (($type eq "elf")) { $elf=1; require "x86gas.pl"; } elsif (($type eq "a\.out")) @@ -184,6 +235,8 @@ sub ::asm_init { $win32=1; require "x86masm.pl"; } elsif (($type eq "macosx")) { $aout=1; $macosx=1; require "x86gas.pl"; } + elsif (($type eq "android")) + { $elf=1; $android=1; require "x86gas.pl"; } else { print STDERR <<"EOF"; Pick one target type from diff --git a/src/lib/libcrypto/perlasm/x86gas.pl b/src/lib/libcrypto/perlasm/x86gas.pl index 6eab727fd4..682a3a3163 100644 --- a/src/lib/libcrypto/perlasm/x86gas.pl +++ b/src/lib/libcrypto/perlasm/x86gas.pl @@ -45,9 +45,8 @@ sub ::generic undef $suffix if ($dst =~ m/^%[xm]/o || $src =~ m/^%[xm]/o); if ($#_==0) { &::emit($opcode); } - elsif ($opcode =~ m/^j/o && $#_==1) { &::emit($opcode,@arg); } - elsif ($opcode eq "call" && $#_==1) { &::emit($opcode,@arg); } - elsif ($opcode =~ m/^set/&& $#_==1) { &::emit($opcode,@arg); } + elsif ($#_==1 && $opcode =~ m/^(call|clflush|j|loop|set)/o) + { &::emit($opcode,@arg); } else { &::emit($opcode.$suffix,@arg);} 1; @@ -91,6 +90,7 @@ sub ::DWP } sub ::QWP { &::DWP(@_); } sub ::BP { &::DWP(@_); } +sub ::WP { &::DWP(@_); } sub ::BC { @_; } sub ::DWC { @_; } @@ -149,22 +149,24 @@ sub ::public_label { push(@out,".globl\t".&::LABEL($_[0],$nmdecor.$_[0])."\n"); } sub ::file_end -{ if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) { - my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,4"; - if ($::elf) { push (@out,"$tmp,4\n"); } - else { push (@out,"$tmp\n"); } - } - if ($::macosx) +{ if ($::macosx) { if (%non_lazy_ptr) { push(@out,".section __IMPORT,__pointers,non_lazy_symbol_pointers\n"); foreach $i (keys %non_lazy_ptr) { push(@out,"$non_lazy_ptr{$i}:\n.indirect_symbol\t$i\n.long\t0\n"); } } } + if (grep {/\b${nmdecor}OPENSSL_ia32cap_P\b/i} @out) { + my $tmp=".comm\t${nmdecor}OPENSSL_ia32cap_P,8"; + if ($::macosx) { push (@out,"$tmp,2\n"); } + elsif ($::elf) { push (@out,"$tmp,4\n"); } + else { push (@out,"$tmp\n"); } + } push(@out,$initseg) if ($initseg); } sub ::data_byte { push(@out,".byte\t".join(',',@_)."\n"); } +sub ::data_short{ push(@out,".value\t".join(',',@_)."\n"); } sub ::data_word { push(@out,".long\t".join(',',@_)."\n"); } sub ::align @@ -180,7 +182,7 @@ sub ::align sub ::picmeup { my($dst,$sym,$base,$reflabel)=@_; - if ($::pic && ($::elf || $::aout)) + if (($::pic && ($::elf || $::aout)) || $::macosx) { if (!defined($base)) { &::call(&::label("PIC_me_up")); &::set_label("PIC_me_up"); @@ -206,13 +208,17 @@ sub ::picmeup sub ::initseg { my $f=$nmdecor.shift; - if ($::elf) + if ($::android) + { $initseg.=<<___; +.section .init_array +.align 4 +.long $f +___ + } + elsif ($::elf) { $initseg.=<<___; .section .init call $f - jmp .Linitalign -.align $align -.Linitalign: ___ } elsif ($::coff) diff --git a/src/lib/libcrypto/pkcs12/p12_decr.c b/src/lib/libcrypto/pkcs12/p12_decr.c index ba77dbbe32..9d3557e8d7 100644 --- a/src/lib/libcrypto/pkcs12/p12_decr.c +++ b/src/lib/libcrypto/pkcs12/p12_decr.c @@ -89,7 +89,14 @@ unsigned char * PKCS12_pbe_crypt(X509_ALGOR *algor, const char *pass, goto err; } - EVP_CipherUpdate(&ctx, out, &i, in, inlen); + if (!EVP_CipherUpdate(&ctx, out, &i, in, inlen)) + { + OPENSSL_free(out); + out = NULL; + PKCS12err(PKCS12_F_PKCS12_PBE_CRYPT,ERR_R_EVP_LIB); + goto err; + } + outlen = i; if(!EVP_CipherFinal_ex(&ctx, out + i, &i)) { OPENSSL_free(out); diff --git a/src/lib/libcrypto/pkcs12/p12_key.c b/src/lib/libcrypto/pkcs12/p12_key.c index 424203f648..c55c7b60b3 100644 --- a/src/lib/libcrypto/pkcs12/p12_key.c +++ b/src/lib/libcrypto/pkcs12/p12_key.c @@ -152,14 +152,16 @@ int PKCS12_key_gen_uni(unsigned char *pass, int passlen, unsigned char *salt, for (i = 0; i < Slen; i++) *p++ = salt[i % saltlen]; for (i = 0; i < Plen; i++) *p++ = pass[i % passlen]; for (;;) { - EVP_DigestInit_ex(&ctx, md_type, NULL); - EVP_DigestUpdate(&ctx, D, v); - EVP_DigestUpdate(&ctx, I, Ilen); - EVP_DigestFinal_ex(&ctx, Ai, NULL); + if (!EVP_DigestInit_ex(&ctx, md_type, NULL) + || !EVP_DigestUpdate(&ctx, D, v) + || !EVP_DigestUpdate(&ctx, I, Ilen) + || !EVP_DigestFinal_ex(&ctx, Ai, NULL)) + goto err; for (j = 1; j < iter; j++) { - EVP_DigestInit_ex(&ctx, md_type, NULL); - EVP_DigestUpdate(&ctx, Ai, u); - EVP_DigestFinal_ex(&ctx, Ai, NULL); + if (!EVP_DigestInit_ex(&ctx, md_type, NULL) + || !EVP_DigestUpdate(&ctx, Ai, u) + || !EVP_DigestFinal_ex(&ctx, Ai, NULL)) + goto err; } memcpy (out, Ai, min (n, u)); if (u >= n) { diff --git a/src/lib/libcrypto/pkcs12/p12_kiss.c b/src/lib/libcrypto/pkcs12/p12_kiss.c index 292cc3ed4a..206b1b0b18 100644 --- a/src/lib/libcrypto/pkcs12/p12_kiss.c +++ b/src/lib/libcrypto/pkcs12/p12_kiss.c @@ -167,7 +167,7 @@ int PKCS12_parse(PKCS12 *p12, const char *pass, EVP_PKEY **pkey, X509 **cert, if (cert && *cert) X509_free(*cert); if (x) - X509_free(*cert); + X509_free(x); if (ocerts) sk_X509_pop_free(ocerts, X509_free); return 0; diff --git a/src/lib/libcrypto/pkcs12/p12_mutl.c b/src/lib/libcrypto/pkcs12/p12_mutl.c index 9ab740d51f..96de1bd11e 100644 --- a/src/lib/libcrypto/pkcs12/p12_mutl.c +++ b/src/lib/libcrypto/pkcs12/p12_mutl.c @@ -97,10 +97,14 @@ int PKCS12_gen_mac(PKCS12 *p12, const char *pass, int passlen, return 0; } HMAC_CTX_init(&hmac); - HMAC_Init_ex(&hmac, key, md_size, md_type, NULL); - HMAC_Update(&hmac, p12->authsafes->d.data->data, - p12->authsafes->d.data->length); - HMAC_Final(&hmac, mac, maclen); + if (!HMAC_Init_ex(&hmac, key, md_size, md_type, NULL) + || !HMAC_Update(&hmac, p12->authsafes->d.data->data, + p12->authsafes->d.data->length) + || !HMAC_Final(&hmac, mac, maclen)) + { + HMAC_CTX_cleanup(&hmac); + return 0; + } HMAC_CTX_cleanup(&hmac); return 1; } diff --git a/src/lib/libcrypto/pkcs7/pk7_doit.c b/src/lib/libcrypto/pkcs7/pk7_doit.c index 3bf1a367bb..77fda3b82a 100644 --- a/src/lib/libcrypto/pkcs7/pk7_doit.c +++ b/src/lib/libcrypto/pkcs7/pk7_doit.c @@ -204,11 +204,11 @@ static int pkcs7_decrypt_rinfo(unsigned char **pek, int *peklen, unsigned char *ek = NULL; size_t eklen; - int ret = 0; + int ret = -1; pctx = EVP_PKEY_CTX_new(pkey, NULL); if (!pctx) - return 0; + return -1; if (EVP_PKEY_decrypt_init(pctx) <= 0) goto err; @@ -235,12 +235,19 @@ static int pkcs7_decrypt_rinfo(unsigned char **pek, int *peklen, if (EVP_PKEY_decrypt(pctx, ek, &eklen, ri->enc_key->data, ri->enc_key->length) <= 0) { + ret = 0; PKCS7err(PKCS7_F_PKCS7_DECRYPT_RINFO, ERR_R_EVP_LIB); goto err; } ret = 1; + if (*pek) + { + OPENSSL_cleanse(*pek, *peklen); + OPENSSL_free(*pek); + } + *pek = ek; *peklen = eklen; @@ -423,6 +430,8 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert) STACK_OF(X509_ALGOR) *md_sk=NULL; STACK_OF(PKCS7_RECIP_INFO) *rsk=NULL; PKCS7_RECIP_INFO *ri=NULL; + unsigned char *ek = NULL, *tkey = NULL; + int eklen = 0, tkeylen = 0; i=OBJ_obj2nid(p7->type); p7->state=PKCS7_S_HEADER; @@ -500,8 +509,6 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert) int max; X509_OBJECT ret; #endif - unsigned char *ek = NULL; - int eklen; if ((etmp=BIO_new(BIO_f_cipher())) == NULL) { @@ -534,29 +541,28 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert) } /* If we haven't got a certificate try each ri in turn */ - if (pcert == NULL) { + /* Always attempt to decrypt all rinfo even + * after sucess as a defence against MMA timing + * attacks. + */ for (i=0; i 0) - break; + ri, pkey) < 0) + goto err; ERR_clear_error(); - ri = NULL; - } - if (ri == NULL) - { - PKCS7err(PKCS7_F_PKCS7_DATADECODE, - PKCS7_R_NO_RECIPIENT_MATCHES_KEY); - goto err; } } else { - if (pkcs7_decrypt_rinfo(&ek, &eklen, ri, pkey) <= 0) + /* Only exit on fatal errors, not decrypt failure */ + if (pkcs7_decrypt_rinfo(&ek, &eklen, ri, pkey) < 0) goto err; + ERR_clear_error(); } evp_ctx=NULL; @@ -565,6 +571,19 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert) goto err; if (EVP_CIPHER_asn1_to_param(evp_ctx,enc_alg->parameter) < 0) goto err; + /* Generate random key as MMA defence */ + tkeylen = EVP_CIPHER_CTX_key_length(evp_ctx); + tkey = OPENSSL_malloc(tkeylen); + if (!tkey) + goto err; + if (EVP_CIPHER_CTX_rand_key(evp_ctx, tkey) <= 0) + goto err; + if (ek == NULL) + { + ek = tkey; + eklen = tkeylen; + tkey = NULL; + } if (eklen != EVP_CIPHER_CTX_key_length(evp_ctx)) { /* Some S/MIME clients don't use the same key @@ -573,11 +592,16 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert) */ if(!EVP_CIPHER_CTX_set_key_length(evp_ctx, eklen)) { - PKCS7err(PKCS7_F_PKCS7_DATADECODE, - PKCS7_R_DECRYPTED_KEY_IS_WRONG_LENGTH); - goto err; + /* Use random key as MMA defence */ + OPENSSL_cleanse(ek, eklen); + OPENSSL_free(ek); + ek = tkey; + eklen = tkeylen; + tkey = NULL; } } + /* Clear errors so we don't leak information useful in MMA */ + ERR_clear_error(); if (EVP_CipherInit_ex(evp_ctx,NULL,NULL,ek,NULL,0) <= 0) goto err; @@ -585,6 +609,13 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert) { OPENSSL_cleanse(ek,eklen); OPENSSL_free(ek); + ek = NULL; + } + if (tkey) + { + OPENSSL_cleanse(tkey,tkeylen); + OPENSSL_free(tkey); + tkey = NULL; } if (out == NULL) @@ -627,6 +658,16 @@ BIO *PKCS7_dataDecode(PKCS7 *p7, EVP_PKEY *pkey, BIO *in_bio, X509 *pcert) if (0) { err: + if (ek) + { + OPENSSL_cleanse(ek,eklen); + OPENSSL_free(ek); + } + if (tkey) + { + OPENSSL_cleanse(tkey,tkeylen); + OPENSSL_free(tkey); + } if (out != NULL) BIO_free_all(out); if (btmp != NULL) BIO_free_all(btmp); if (etmp != NULL) BIO_free_all(etmp); @@ -676,7 +717,11 @@ static int do_pkcs7_signed_attrib(PKCS7_SIGNER_INFO *si, EVP_MD_CTX *mctx) } /* Add digest */ - EVP_DigestFinal_ex(mctx, md_data,&md_len); + if (!EVP_DigestFinal_ex(mctx, md_data,&md_len)) + { + PKCS7err(PKCS7_F_DO_PKCS7_SIGNED_ATTRIB, ERR_R_EVP_LIB); + return 0; + } if (!PKCS7_add1_attrib_digest(si, md_data, md_len)) { PKCS7err(PKCS7_F_DO_PKCS7_SIGNED_ATTRIB, ERR_R_MALLOC_FAILURE); @@ -784,7 +829,8 @@ int PKCS7_dataFinal(PKCS7 *p7, BIO *bio) /* We now have the EVP_MD_CTX, lets do the * signing. */ - EVP_MD_CTX_copy_ex(&ctx_tmp,mdc); + if (!EVP_MD_CTX_copy_ex(&ctx_tmp,mdc)) + goto err; sk=si->auth_attr; @@ -822,7 +868,8 @@ int PKCS7_dataFinal(PKCS7 *p7, BIO *bio) if (!PKCS7_find_digest(&mdc, bio, OBJ_obj2nid(p7->d.digest->md->algorithm))) goto err; - EVP_DigestFinal_ex(mdc,md_data,&md_len); + if (!EVP_DigestFinal_ex(mdc,md_data,&md_len)) + goto err; M_ASN1_OCTET_STRING_set(p7->d.digest->digest, md_data, md_len); } @@ -1015,7 +1062,8 @@ int PKCS7_signatureVerify(BIO *bio, PKCS7 *p7, PKCS7_SIGNER_INFO *si, /* mdc is the digest ctx that we want, unless there are attributes, * in which case the digest is the signed attributes */ - EVP_MD_CTX_copy_ex(&mdc_tmp,mdc); + if (!EVP_MD_CTX_copy_ex(&mdc_tmp,mdc)) + goto err; sk=si->auth_attr; if ((sk != NULL) && (sk_X509_ATTRIBUTE_num(sk) != 0)) @@ -1025,7 +1073,8 @@ int PKCS7_signatureVerify(BIO *bio, PKCS7 *p7, PKCS7_SIGNER_INFO *si, int alen; ASN1_OCTET_STRING *message_digest; - EVP_DigestFinal_ex(&mdc_tmp,md_dat,&md_len); + if (!EVP_DigestFinal_ex(&mdc_tmp,md_dat,&md_len)) + goto err; message_digest=PKCS7_digest_from_attributes(sk); if (!message_digest) { @@ -1050,7 +1099,8 @@ for (ii=0; ii 0 && BIO_method_type(tmpmem) == BIO_TYPE_CIPHER) + { + if (!BIO_get_cipher_status(tmpmem)) + ret = 0; + } BIO_free_all(bread); return ret; } else { for(;;) { i = BIO_read(tmpmem, buf, sizeof(buf)); - if(i <= 0) break; - BIO_write(data, buf, i); + if(i <= 0) + { + ret = 1; + if (BIO_method_type(tmpmem) == BIO_TYPE_CIPHER) + { + if (!BIO_get_cipher_status(tmpmem)) + ret = 0; + } + + break; + } + if (BIO_write(data, buf, i) != i) + { + ret = 0; + break; + } } BIO_free_all(tmpmem); - return 1; + return ret; } } diff --git a/src/lib/libcrypto/ppccap.c b/src/lib/libcrypto/ppccap.c new file mode 100644 index 0000000000..ab89ccaa12 --- /dev/null +++ b/src/lib/libcrypto/ppccap.c @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include +#include +#include + +#define PPC_FPU64 (1<<0) +#define PPC_ALTIVEC (1<<1) + +static int OPENSSL_ppccap_P = 0; + +static sigset_t all_masked; + +#ifdef OPENSSL_BN_ASM_MONT +int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num) + { + int bn_mul_mont_fpu64(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num); + int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num); + + if (sizeof(size_t)==4) + { +#if (defined(__APPLE__) && defined(__MACH__)) + if (num>=8 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64)) + return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num); +#else + /* boundary of 32 was experimentally determined on + Linux 2.6.22, might have to be adjusted on AIX... */ + if (num>=32 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64)) + { + sigset_t oset; + int ret; + + sigprocmask(SIG_SETMASK,&all_masked,&oset); + ret=bn_mul_mont_fpu64(rp,ap,bp,np,n0,num); + sigprocmask(SIG_SETMASK,&oset,NULL); + + return ret; + } +#endif + } + else if ((OPENSSL_ppccap_P&PPC_FPU64)) + /* this is a "must" on POWER6, but run-time detection + * is not implemented yet... */ + return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num); + + return bn_mul_mont_int(rp,ap,bp,np,n0,num); + } +#endif + +static sigjmp_buf ill_jmp; +static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } + +void OPENSSL_ppc64_probe(void); + +void OPENSSL_cpuid_setup(void) + { + char *e; + struct sigaction ill_oact,ill_act; + sigset_t oset; + static int trigger=0; + + if (trigger) return; + trigger=1; + + sigfillset(&all_masked); + sigdelset(&all_masked,SIGILL); + sigdelset(&all_masked,SIGTRAP); +#ifdef SIGEMT + sigdelset(&all_masked,SIGEMT); +#endif + sigdelset(&all_masked,SIGFPE); + sigdelset(&all_masked,SIGBUS); + sigdelset(&all_masked,SIGSEGV); + + if ((e=getenv("OPENSSL_ppccap"))) + { + OPENSSL_ppccap_P=strtoul(e,NULL,0); + return; + } + + OPENSSL_ppccap_P = 0; + + memset(&ill_act,0,sizeof(ill_act)); + ill_act.sa_handler = ill_handler; + ill_act.sa_mask = all_masked; + + sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset); + sigaction(SIGILL,&ill_act,&ill_oact); + + if (sizeof(size_t)==4) + { + if (sigsetjmp(ill_jmp,1) == 0) + { + OPENSSL_ppc64_probe(); + OPENSSL_ppccap_P |= PPC_FPU64; + } + } + else + { + /* + * Wanted code detecting POWER6 CPU and setting PPC_FPU64 + */ + } + + if (sigsetjmp(ill_jmp,1) == 0) + { + OPENSSL_altivec_probe(); + OPENSSL_ppccap_P |= PPC_ALTIVEC; + } + + sigaction (SIGILL,&ill_oact,NULL); + sigprocmask(SIG_SETMASK,&oset,NULL); + } diff --git a/src/lib/libcrypto/ppccpuid.pl b/src/lib/libcrypto/ppccpuid.pl index 369e1d0df9..4ba736a1d1 100755 --- a/src/lib/libcrypto/ppccpuid.pl +++ b/src/lib/libcrypto/ppccpuid.pl @@ -23,36 +23,67 @@ $code=<<___; .machine "any" .text -.globl .OPENSSL_cpuid_setup +.globl .OPENSSL_ppc64_probe .align 4 -.OPENSSL_cpuid_setup: +.OPENSSL_ppc64_probe: + fcfid f1,f1 + extrdi r0,r0,32,0 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + +.globl .OPENSSL_altivec_probe +.align 4 +.OPENSSL_altivec_probe: + .long 0x10000484 # vor v0,v0,v0 + blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_wipe_cpu .align 4 .OPENSSL_wipe_cpu: xor r0,r0,r0 + fmr f0,f31 + fmr f1,f31 + fmr f2,f31 mr r3,r1 + fmr f3,f31 xor r4,r4,r4 + fmr f4,f31 xor r5,r5,r5 + fmr f5,f31 xor r6,r6,r6 + fmr f6,f31 xor r7,r7,r7 + fmr f7,f31 xor r8,r8,r8 + fmr f8,f31 xor r9,r9,r9 + fmr f9,f31 xor r10,r10,r10 + fmr f10,f31 xor r11,r11,r11 + fmr f11,f31 xor r12,r12,r12 + fmr f12,f31 + fmr f13,f31 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_atomic_add .align 4 .OPENSSL_atomic_add: -Loop: lwarx r5,0,r3 +Ladd: lwarx r5,0,r3 add r0,r4,r5 stwcx. r0,0,r3 - bne- Loop + bne- Ladd $SIGNX r3,r0 blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 .globl .OPENSSL_rdtsc .align 4 @@ -60,6 +91,8 @@ Loop: lwarx r5,0,r3 mftb r3 mftbu r4 blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 .globl .OPENSSL_cleanse .align 4 @@ -72,7 +105,7 @@ Loop: lwarx r5,0,r3 Little: mtctr r4 stb r0,0(r3) addi r3,r3,1 - bdnz- \$-8 + bdnz \$-8 blr Lot: andi. r5,r3,3 beq Laligned @@ -85,10 +118,13 @@ Laligned: mtctr r5 stw r0,0(r3) addi r3,r3,4 - bdnz- \$-8 + bdnz \$-8 andi. r4,r4,3 bne Little blr + .long 0 + .byte 0,12,0x14,0,0,0,2,0 + .long 0 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/src/lib/libcrypto/rand/rand.h b/src/lib/libcrypto/rand/rand.h index ac6c021763..dc8fcf94c5 100644 --- a/src/lib/libcrypto/rand/rand.h +++ b/src/lib/libcrypto/rand/rand.h @@ -119,6 +119,11 @@ int RAND_event(UINT, WPARAM, LPARAM); #endif +#ifdef OPENSSL_FIPS +void RAND_set_fips_drbg_type(int type, int flags); +int RAND_init_fips(void); +#endif + /* BEGIN ERROR CODES */ /* The following lines are auto generated by the script mkerr.pl. Any changes * made after this point may be overwritten when the script is next run. @@ -129,9 +134,13 @@ void ERR_load_RAND_strings(void); /* Function codes. */ #define RAND_F_RAND_GET_RAND_METHOD 101 +#define RAND_F_RAND_INIT_FIPS 102 #define RAND_F_SSLEAY_RAND_BYTES 100 /* Reason codes. */ +#define RAND_R_ERROR_INITIALISING_DRBG 102 +#define RAND_R_ERROR_INSTANTIATING_DRBG 103 +#define RAND_R_NO_FIPS_RANDOM_METHOD_SET 101 #define RAND_R_PRNG_NOT_SEEDED 100 #ifdef __cplusplus diff --git a/src/lib/libcrypto/rand/rand_err.c b/src/lib/libcrypto/rand/rand_err.c index 03cda4dd92..b8586c8f4a 100644 --- a/src/lib/libcrypto/rand/rand_err.c +++ b/src/lib/libcrypto/rand/rand_err.c @@ -1,6 +1,6 @@ /* crypto/rand/rand_err.c */ /* ==================================================================== - * Copyright (c) 1999-2006 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -71,12 +71,16 @@ static ERR_STRING_DATA RAND_str_functs[]= { {ERR_FUNC(RAND_F_RAND_GET_RAND_METHOD), "RAND_get_rand_method"}, +{ERR_FUNC(RAND_F_RAND_INIT_FIPS), "RAND_init_fips"}, {ERR_FUNC(RAND_F_SSLEAY_RAND_BYTES), "SSLEAY_RAND_BYTES"}, {0,NULL} }; static ERR_STRING_DATA RAND_str_reasons[]= { +{ERR_REASON(RAND_R_ERROR_INITIALISING_DRBG),"error initialising drbg"}, +{ERR_REASON(RAND_R_ERROR_INSTANTIATING_DRBG),"error instantiating drbg"}, +{ERR_REASON(RAND_R_NO_FIPS_RANDOM_METHOD_SET),"no fips random method set"}, {ERR_REASON(RAND_R_PRNG_NOT_SEEDED) ,"PRNG not seeded"}, {0,NULL} }; diff --git a/src/lib/libcrypto/rand/rand_lib.c b/src/lib/libcrypto/rand/rand_lib.c index 513e338985..daf1dab973 100644 --- a/src/lib/libcrypto/rand/rand_lib.c +++ b/src/lib/libcrypto/rand/rand_lib.c @@ -60,10 +60,16 @@ #include #include "cryptlib.h" #include + #ifndef OPENSSL_NO_ENGINE #include #endif +#ifdef OPENSSL_FIPS +#include +#include +#endif + #ifndef OPENSSL_NO_ENGINE /* non-NULL if default_RAND_meth is ENGINE-provided */ static ENGINE *funct_ref =NULL; @@ -174,3 +180,116 @@ int RAND_status(void) return meth->status(); return 0; } + +#ifdef OPENSSL_FIPS + +/* FIPS DRBG initialisation code. This sets up the DRBG for use by the + * rest of OpenSSL. + */ + +/* Entropy gatherer: use standard OpenSSL PRNG to seed (this will gather + * entropy internally through RAND_poll(). + */ + +static size_t drbg_get_entropy(DRBG_CTX *ctx, unsigned char **pout, + int entropy, size_t min_len, size_t max_len) + { + /* Round up request to multiple of block size */ + min_len = ((min_len + 19) / 20) * 20; + *pout = OPENSSL_malloc(min_len); + if (!*pout) + return 0; + if (RAND_SSLeay()->bytes(*pout, min_len) <= 0) + { + OPENSSL_free(*pout); + *pout = NULL; + return 0; + } + return min_len; + } + +static void drbg_free_entropy(DRBG_CTX *ctx, unsigned char *out, size_t olen) + { + OPENSSL_cleanse(out, olen); + OPENSSL_free(out); + } + +/* Set "additional input" when generating random data. This uses the + * current PID, a time value and a counter. + */ + +static size_t drbg_get_adin(DRBG_CTX *ctx, unsigned char **pout) + { + /* Use of static variables is OK as this happens under a lock */ + static unsigned char buf[16]; + static unsigned long counter; + FIPS_get_timevec(buf, &counter); + *pout = buf; + return sizeof(buf); + } + +/* RAND_add() and RAND_seed() pass through to OpenSSL PRNG so it is + * correctly seeded by RAND_poll(). + */ + +static int drbg_rand_add(DRBG_CTX *ctx, const void *in, int inlen, + double entropy) + { + RAND_SSLeay()->add(in, inlen, entropy); + return 1; + } + +static int drbg_rand_seed(DRBG_CTX *ctx, const void *in, int inlen) + { + RAND_SSLeay()->seed(in, inlen); + return 1; + } + +#ifndef OPENSSL_DRBG_DEFAULT_TYPE +#define OPENSSL_DRBG_DEFAULT_TYPE NID_aes_256_ctr +#endif +#ifndef OPENSSL_DRBG_DEFAULT_FLAGS +#define OPENSSL_DRBG_DEFAULT_FLAGS DRBG_FLAG_CTR_USE_DF +#endif + +static int fips_drbg_type = OPENSSL_DRBG_DEFAULT_TYPE; +static int fips_drbg_flags = OPENSSL_DRBG_DEFAULT_FLAGS; + +void RAND_set_fips_drbg_type(int type, int flags) + { + fips_drbg_type = type; + fips_drbg_flags = flags; + } + +int RAND_init_fips(void) + { + DRBG_CTX *dctx; + size_t plen; + unsigned char pers[32], *p; + dctx = FIPS_get_default_drbg(); + if (FIPS_drbg_init(dctx, fips_drbg_type, fips_drbg_flags) <= 0) + { + RANDerr(RAND_F_RAND_INIT_FIPS, RAND_R_ERROR_INITIALISING_DRBG); + return 0; + } + + FIPS_drbg_set_callbacks(dctx, + drbg_get_entropy, drbg_free_entropy, 20, + drbg_get_entropy, drbg_free_entropy); + FIPS_drbg_set_rand_callbacks(dctx, drbg_get_adin, 0, + drbg_rand_seed, drbg_rand_add); + /* Personalisation string: a string followed by date time vector */ + strcpy((char *)pers, "OpenSSL DRBG2.0"); + plen = drbg_get_adin(dctx, &p); + memcpy(pers + 16, p, plen); + + if (FIPS_drbg_instantiate(dctx, pers, sizeof(pers)) <= 0) + { + RANDerr(RAND_F_RAND_INIT_FIPS, RAND_R_ERROR_INSTANTIATING_DRBG); + return 0; + } + FIPS_rand_set_method(FIPS_drbg_method()); + return 1; + } + +#endif diff --git a/src/lib/libcrypto/rand/randfile.c b/src/lib/libcrypto/rand/randfile.c index bc7d9c5804..030e07f418 100644 --- a/src/lib/libcrypto/rand/randfile.c +++ b/src/lib/libcrypto/rand/randfile.c @@ -137,7 +137,7 @@ int RAND_load_file(const char *file, long bytes) in=fopen(file,"rb"); #endif if (in == NULL) goto err; -#if defined(S_IFBLK) && defined(S_IFCHR) && !defined(OPNESSL_NO_POSIX_IO) +#if defined(S_IFBLK) && defined(S_IFCHR) && !defined(OPENSSL_NO_POSIX_IO) if (sb.st_mode & (S_IFBLK | S_IFCHR)) { /* this file is a device. we don't want read an infinite number * of bytes from a random device, nor do we want to use buffered diff --git a/src/lib/libcrypto/rc2/rc2.h b/src/lib/libcrypto/rc2/rc2.h index 34c8362317..e542ec94ff 100644 --- a/src/lib/libcrypto/rc2/rc2.h +++ b/src/lib/libcrypto/rc2/rc2.h @@ -79,7 +79,9 @@ typedef struct rc2_key_st RC2_INT data[64]; } RC2_KEY; - +#ifdef OPENSSL_FIPS +void private_RC2_set_key(RC2_KEY *key, int len, const unsigned char *data,int bits); +#endif void RC2_set_key(RC2_KEY *key, int len, const unsigned char *data,int bits); void RC2_ecb_encrypt(const unsigned char *in,unsigned char *out,RC2_KEY *key, int enc); diff --git a/src/lib/libcrypto/rc2/rc2_skey.c b/src/lib/libcrypto/rc2/rc2_skey.c index 0150b0e035..6668ac011f 100644 --- a/src/lib/libcrypto/rc2/rc2_skey.c +++ b/src/lib/libcrypto/rc2/rc2_skey.c @@ -56,6 +56,7 @@ * [including the GNU Public Licence.] */ +#include #include #include "rc2_locl.h" @@ -95,6 +96,13 @@ static const unsigned char key_table[256]={ * the same as specifying 1024 for the 'bits' parameter. Bsafe uses * a version where the bits parameter is the same as len*8 */ void RC2_set_key(RC2_KEY *key, int len, const unsigned char *data, int bits) +#ifdef OPENSSL_FIPS + { + fips_cipher_abort(RC2); + private_RC2_set_key(key, len, data, bits); + } +void private_RC2_set_key(RC2_KEY *key, int len, const unsigned char *data, int bits) +#endif { int i,j; unsigned char *k; diff --git a/src/lib/libcrypto/rc4/asm/rc4-586.pl b/src/lib/libcrypto/rc4/asm/rc4-586.pl index 38a44a70ef..5c9ac6ad28 100644 --- a/src/lib/libcrypto/rc4/asm/rc4-586.pl +++ b/src/lib/libcrypto/rc4/asm/rc4-586.pl @@ -28,6 +28,34 @@ # # +# May 2011 +# +# Optimize for Core2 and Westmere [and incidentally Opteron]. Current +# performance in cycles per processed byte (less is better) and +# improvement relative to previous version of this module is: +# +# Pentium 10.2 # original numbers +# Pentium III 7.8(*) +# Intel P4 7.5 +# +# Opteron 6.1/+20% # new MMX numbers +# Core2 5.3/+67%(**) +# Westmere 5.1/+94%(**) +# Sandy Bridge 5.0/+8% +# Atom 12.6/+6% +# +# (*) PIII can actually deliver 6.6 cycles per byte with MMX code, +# but this specific code performs poorly on Core2. And vice +# versa, below MMX/SSE code delivering 5.8/7.1 on Core2 performs +# poorly on PIII, at 8.0/14.5:-( As PIII is not a "hot" CPU +# [anymore], I chose to discard PIII-specific code path and opt +# for original IALU-only code, which is why MMX/SSE code path +# is guarded by SSE2 bit (see below), not MMX/SSE. +# (**) Performance vs. block size on Core2 and Westmere had a maximum +# at ... 64 bytes block size. And it was quite a maximum, 40-60% +# in comparison to largest 8KB block size. Above improvement +# coefficients are for the largest block size. + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; @@ -62,6 +90,68 @@ sub RC4_loop { &$func ($out,&DWP(0,$dat,$ty,4)); } +if ($alt=0) { + # >20% faster on Atom and Sandy Bridge[!], 8% faster on Opteron, + # but ~40% slower on Core2 and Westmere... Attempt to add movz + # brings down Opteron by 25%, Atom and Sandy Bridge by 15%, yet + # on Core2 with movz it's almost 20% slower than below alternative + # code... Yes, it's a total mess... + my @XX=($xx,$out); + $RC4_loop_mmx = sub { # SSE actually... + my $i=shift; + my $j=$i<=0?0:$i>>1; + my $mm=$i<=0?"mm0":"mm".($i&1); + + &add (&LB($yy),&LB($tx)); + &lea (@XX[1],&DWP(1,@XX[0])); + &pxor ("mm2","mm0") if ($i==0); + &psllq ("mm1",8) if ($i==0); + &and (@XX[1],0xff); + &pxor ("mm0","mm0") if ($i<=0); + &mov ($ty,&DWP(0,$dat,$yy,4)); + &mov (&DWP(0,$dat,$yy,4),$tx); + &pxor ("mm1","mm2") if ($i==0); + &mov (&DWP(0,$dat,$XX[0],4),$ty); + &add (&LB($ty),&LB($tx)); + &movd (@XX[0],"mm7") if ($i==0); + &mov ($tx,&DWP(0,$dat,@XX[1],4)); + &pxor ("mm1","mm1") if ($i==1); + &movq ("mm2",&QWP(0,$inp)) if ($i==1); + &movq (&QWP(-8,(@XX[0],$inp)),"mm1") if ($i==0); + &pinsrw ($mm,&DWP(0,$dat,$ty,4),$j); + + push (@XX,shift(@XX)) if ($i>=0); + } +} else { + # Using pinsrw here improves performane on Intel CPUs by 2-3%, but + # brings down AMD by 7%... + $RC4_loop_mmx = sub { + my $i=shift; + + &add (&LB($yy),&LB($tx)); + &psllq ("mm1",8*(($i-1)&7)) if (abs($i)!=1); + &mov ($ty,&DWP(0,$dat,$yy,4)); + &mov (&DWP(0,$dat,$yy,4),$tx); + &mov (&DWP(0,$dat,$xx,4),$ty); + &inc ($xx); + &add ($ty,$tx); + &movz ($xx,&LB($xx)); # (*) + &movz ($ty,&LB($ty)); # (*) + &pxor ("mm2",$i==1?"mm0":"mm1") if ($i>=0); + &movq ("mm0",&QWP(0,$inp)) if ($i<=0); + &movq (&QWP(-8,($out,$inp)),"mm2") if ($i==0); + &mov ($tx,&DWP(0,$dat,$xx,4)); + &movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4)); + + # (*) This is the key to Core2 and Westmere performance. + # Whithout movz out-of-order execution logic confuses + # itself and fails to reorder loads and stores. Problem + # appears to be fixed in Sandy Bridge... + } +} + +&external_label("OPENSSL_ia32cap_P"); + # void RC4(RC4_KEY *key,size_t len,const unsigned char *inp,unsigned char *out); &function_begin("RC4"); &mov ($dat,&wparam(0)); # load key schedule pointer @@ -94,11 +184,56 @@ sub RC4_loop { &and ($ty,-4); # how many 4-byte chunks? &jz (&label("loop1")); + &test ($ty,-8); + &mov (&wparam(3),$out); # $out as accumulator in these loops + &jz (&label("go4loop4")); + + &picmeup($out,"OPENSSL_ia32cap_P"); + &bt (&DWP(0,$out),26); # check SSE2 bit [could have been MMX] + &jnc (&label("go4loop4")); + + &mov ($out,&wparam(3)) if (!$alt); + &movd ("mm7",&wparam(3)) if ($alt); + &and ($ty,-8); + &lea ($ty,&DWP(-8,$inp,$ty)); + &mov (&DWP(-4,$dat),$ty); # save input+(len/8)*8-8 + + &$RC4_loop_mmx(-1); + &jmp(&label("loop_mmx_enter")); + + &set_label("loop_mmx",16); + &$RC4_loop_mmx(0); + &set_label("loop_mmx_enter"); + for ($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); } + &mov ($ty,$yy); + &xor ($yy,$yy); # this is second key to Core2 + &mov (&LB($yy),&LB($ty)); # and Westmere performance... + &cmp ($inp,&DWP(-4,$dat)); + &lea ($inp,&DWP(8,$inp)); + &jb (&label("loop_mmx")); + + if ($alt) { + &movd ($out,"mm7"); + &pxor ("mm2","mm0"); + &psllq ("mm1",8); + &pxor ("mm1","mm2"); + &movq (&QWP(-8,$out,$inp),"mm1"); + } else { + &psllq ("mm1",56); + &pxor ("mm2","mm1"); + &movq (&QWP(-8,$out,$inp),"mm2"); + } + &emms (); + + &cmp ($inp,&wparam(1)); # compare to input+len + &je (&label("done")); + &jmp (&label("loop1")); + +&set_label("go4loop4",16); &lea ($ty,&DWP(-4,$inp,$ty)); &mov (&wparam(2),$ty); # save input+(len/4)*4-4 - &mov (&wparam(3),$out); # $out as accumulator in this loop - &set_label("loop4",16); + &set_label("loop4"); for ($i=0;$i<4;$i++) { RC4_loop($i); } &ror ($out,8); &xor ($out,&DWP(0,$inp)); @@ -151,7 +286,7 @@ sub RC4_loop { &set_label("done"); &dec (&LB($xx)); - &mov (&BP(-4,$dat),&LB($yy)); # save key->y + &mov (&DWP(-4,$dat),$yy); # save key->y &mov (&BP(-8,$dat),&LB($xx)); # save key->x &set_label("abort"); &function_end("RC4"); @@ -164,10 +299,8 @@ $idi="ebp"; $ido="ecx"; $idx="edx"; -&external_label("OPENSSL_ia32cap_P"); - # void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data); -&function_begin("RC4_set_key"); +&function_begin("private_RC4_set_key"); &mov ($out,&wparam(0)); # load key &mov ($idi,&wparam(1)); # load len &mov ($inp,&wparam(2)); # load data @@ -245,7 +378,7 @@ $idx="edx"; &xor ("eax","eax"); &mov (&DWP(-8,$out),"eax"); # key->x=0; &mov (&DWP(-4,$out),"eax"); # key->y=0; -&function_end("RC4_set_key"); +&function_end("private_RC4_set_key"); # const char *RC4_options(void); &function_begin_B("RC4_options"); @@ -254,14 +387,21 @@ $idx="edx"; &blindpop("eax"); &lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax")); &picmeup("edx","OPENSSL_ia32cap_P"); - &bt (&DWP(0,"edx"),20); - &jnc (&label("skip")); - &add ("eax",12); - &set_label("skip"); + &mov ("edx",&DWP(0,"edx")); + &bt ("edx",20); + &jc (&label("1xchar")); + &bt ("edx",26); + &jnc (&label("ret")); + &add ("eax",25); + &ret (); +&set_label("1xchar"); + &add ("eax",12); +&set_label("ret"); &ret (); &set_label("opts",64); &asciz ("rc4(4x,int)"); &asciz ("rc4(1x,char)"); +&asciz ("rc4(8x,mmx)"); &asciz ("RC4 for x86, CRYPTOGAMS by "); &align (64); &function_end_B("RC4_options"); diff --git a/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl new file mode 100644 index 0000000000..7f684092d4 --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-md5-x86_64.pl @@ -0,0 +1,631 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# June 2011 +# +# This is RC4+MD5 "stitch" implementation. The idea, as spelled in +# http://download.intel.com/design/intarch/papers/323686.pdf, is that +# since both algorithms exhibit instruction-level parallelism, ILP, +# below theoretical maximum, interleaving them would allow to utilize +# processor resources better and achieve better performance. RC4 +# instruction sequence is virtually identical to rc4-x86_64.pl, which +# is heavily based on submission by Maxim Perminov, Maxim Locktyukhin +# and Jim Guilford of Intel. MD5 is fresh implementation aiming to +# minimize register usage, which was used as "main thread" with RC4 +# weaved into it, one RC4 round per one MD5 round. In addition to the +# stiched subroutine the script can generate standalone replacement +# md5_block_asm_data_order and RC4. Below are performance numbers in +# cycles per processed byte, less is better, for these the standalone +# subroutines, sum of them, and stitched one: +# +# RC4 MD5 RC4+MD5 stitch gain +# Opteron 6.5(*) 5.4 11.9 7.0 +70%(*) +# Core2 6.5 5.8 12.3 7.7 +60% +# Westmere 4.3 5.2 9.5 7.0 +36% +# Sandy Bridge 4.2 5.5 9.7 6.8 +43% +# Atom 9.3 6.5 15.8 11.1 +42% +# +# (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement +# is +53%... + +my ($rc4,$md5)=(1,1); # what to generate? +my $D="#" if (!$md5); # if set to "#", MD5 is stitched into RC4(), + # but its result is discarded. Idea here is + # to be able to use 'openssl speed rc4' for + # benchmarking the stitched subroutine... + +my $flavour = shift; +my $output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1; my $xlate; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +my ($dat,$in0,$out,$ctx,$inp,$len, $func,$nargs); + +if ($rc4 && !$md5) { + ($dat,$len,$in0,$out) = ("%rdi","%rsi","%rdx","%rcx"); + $func="RC4"; $nargs=4; +} elsif ($md5 && !$rc4) { + ($ctx,$inp,$len) = ("%rdi","%rsi","%rdx"); + $func="md5_block_asm_data_order"; $nargs=3; +} else { + ($dat,$in0,$out,$ctx,$inp,$len) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); + $func="rc4_md5_enc"; $nargs=6; + # void rc4_md5_enc( + # RC4_KEY *key, # + # const void *in0, # RC4 input + # void *out, # RC4 output + # MD5_CTX *ctx, # + # const void *inp, # MD5 input + # size_t len); # number of 64-byte blocks +} + +my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee, + 0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501, + 0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be, + 0x6b901122,0xfd987193,0xa679438e,0x49b40821, + + 0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa, + 0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8, + 0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed, + 0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a, + + 0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c, + 0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70, + 0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05, + 0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665, + + 0xf4292244,0x432aff97,0xab9423a7,0xfc93a039, + 0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1, + 0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1, + 0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391 ); + +my @V=("%r8d","%r9d","%r10d","%r11d"); # MD5 registers +my $tmp="%r12d"; + +my @XX=("%rbp","%rsi"); # RC4 registers +my @TX=("%rax","%rbx"); +my $YY="%rcx"; +my $TY="%rdx"; + +my $MOD=32; # 16, 32 or 64 + +$code.=<<___; +.text +.align 16 + +.globl $func +.type $func,\@function,$nargs +$func: + cmp \$0,$len + je .Labort + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + sub \$40,%rsp +.Lbody: +___ +if ($rc4) { +$code.=<<___; +$D#md5# mov $ctx,%r11 # reassign arguments + mov $len,%r12 + mov $in0,%r13 + mov $out,%r14 +$D#md5# mov $inp,%r15 +___ + $ctx="%r11" if ($md5); # reassign arguments + $len="%r12"; + $in0="%r13"; + $out="%r14"; + $inp="%r15" if ($md5); + $inp=$in0 if (!$md5); +$code.=<<___; + xor $XX[0],$XX[0] + xor $YY,$YY + + lea 8($dat),$dat + mov -8($dat),$XX[0]#b + mov -4($dat),$YY#b + + inc $XX[0]#b + sub $in0,$out + movl ($dat,$XX[0],4),$TX[0]#d +___ +$code.=<<___ if (!$md5); + xor $TX[1],$TX[1] + test \$-128,$len + jz .Loop1 + sub $XX[0],$TX[1] + and \$`$MOD-1`,$TX[1] + jz .Loop${MOD}_is_hot + sub $TX[1],$len +.Loop${MOD}_warmup: + add $TX[0]#b,$YY#b + movl ($dat,$YY,4),$TY#d + movl $TX[0]#d,($dat,$YY,4) + movl $TY#d,($dat,$XX[0],4) + add $TY#b,$TX[0]#b + inc $XX[0]#b + movl ($dat,$TX[0],4),$TY#d + movl ($dat,$XX[0],4),$TX[0]#d + xorb ($in0),$TY#b + movb $TY#b,($out,$in0) + lea 1($in0),$in0 + dec $TX[1] + jnz .Loop${MOD}_warmup + + mov $YY,$TX[1] + xor $YY,$YY + mov $TX[1]#b,$YY#b + +.Loop${MOD}_is_hot: + mov $len,32(%rsp) # save original $len + shr \$6,$len # number of 64-byte blocks +___ + if ($D && !$md5) { # stitch in dummy MD5 + $md5=1; + $ctx="%r11"; + $inp="%r15"; + $code.=<<___; + mov %rsp,$ctx + mov $in0,$inp +___ + } +} +$code.=<<___; +#rc4# add $TX[0]#b,$YY#b +#rc4# lea ($dat,$XX[0],4),$XX[1] + shl \$6,$len + add $inp,$len # pointer to the end of input + mov $len,16(%rsp) + +#md5# mov $ctx,24(%rsp) # save pointer to MD5_CTX +#md5# mov 0*4($ctx),$V[0] # load current hash value from MD5_CTX +#md5# mov 1*4($ctx),$V[1] +#md5# mov 2*4($ctx),$V[2] +#md5# mov 3*4($ctx),$V[3] + jmp .Loop + +.align 16 +.Loop: +#md5# mov $V[0],0*4(%rsp) # put aside current hash value +#md5# mov $V[1],1*4(%rsp) +#md5# mov $V[2],2*4(%rsp) +#md5# mov $V[3],$tmp # forward reference +#md5# mov $V[3],3*4(%rsp) +___ + +sub R0 { + my ($i,$a,$b,$c,$d)=@_; + my @rot0=(7,12,17,22); + my $j=$i%16; + my $k=$i%$MOD; + my $xmm="%xmm".($j&1); + $code.=" movdqu ($in0),%xmm2\n" if ($rc4 && $j==15); + $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); + $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); + $code.=<<___; +#rc4# movl ($dat,$YY,4),$TY#d +#md5# xor $c,$tmp +#rc4# movl $TX[0]#d,($dat,$YY,4) +#md5# and $b,$tmp +#md5# add 4*`$j`($inp),$a +#rc4# add $TY#b,$TX[0]#b +#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d +#md5# add \$$K[$i],$a +#md5# xor $d,$tmp +#rc4# movz $TX[0]#b,$TX[0]#d +#rc4# movl $TY#d,4*$k($XX[1]) +#md5# add $tmp,$a +#rc4# add $TX[1]#b,$YY#b +#md5# rol \$$rot0[$j%4],$a +#md5# mov `$j==15?"$b":"$c"`,$tmp # forward reference +#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n +#md5# add $b,$a +___ + $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); + mov $YY,$XX[1] + xor $YY,$YY # keyword to partial register + mov $XX[1]#b,$YY#b + lea ($dat,$XX[0],4),$XX[1] +___ + $code.=<<___ if ($rc4 && $j==15); + psllq \$8,%xmm1 + pxor %xmm0,%xmm2 + pxor %xmm1,%xmm2 +___ +} +sub R1 { + my ($i,$a,$b,$c,$d)=@_; + my @rot1=(5,9,14,20); + my $j=$i%16; + my $k=$i%$MOD; + my $xmm="%xmm".($j&1); + $code.=" movdqu 16($in0),%xmm3\n" if ($rc4 && $j==15); + $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); + $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); + $code.=<<___; +#rc4# movl ($dat,$YY,4),$TY#d +#md5# xor $b,$tmp +#rc4# movl $TX[0]#d,($dat,$YY,4) +#md5# and $d,$tmp +#md5# add 4*`((1+5*$j)%16)`($inp),$a +#rc4# add $TY#b,$TX[0]#b +#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d +#md5# add \$$K[$i],$a +#md5# xor $c,$tmp +#rc4# movz $TX[0]#b,$TX[0]#d +#rc4# movl $TY#d,4*$k($XX[1]) +#md5# add $tmp,$a +#rc4# add $TX[1]#b,$YY#b +#md5# rol \$$rot1[$j%4],$a +#md5# mov `$j==15?"$c":"$b"`,$tmp # forward reference +#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n +#md5# add $b,$a +___ + $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); + mov $YY,$XX[1] + xor $YY,$YY # keyword to partial register + mov $XX[1]#b,$YY#b + lea ($dat,$XX[0],4),$XX[1] +___ + $code.=<<___ if ($rc4 && $j==15); + psllq \$8,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 +___ +} +sub R2 { + my ($i,$a,$b,$c,$d)=@_; + my @rot2=(4,11,16,23); + my $j=$i%16; + my $k=$i%$MOD; + my $xmm="%xmm".($j&1); + $code.=" movdqu 32($in0),%xmm4\n" if ($rc4 && $j==15); + $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); + $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); + $code.=<<___; +#rc4# movl ($dat,$YY,4),$TY#d +#md5# xor $c,$tmp +#rc4# movl $TX[0]#d,($dat,$YY,4) +#md5# xor $b,$tmp +#md5# add 4*`((5+3*$j)%16)`($inp),$a +#rc4# add $TY#b,$TX[0]#b +#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d +#md5# add \$$K[$i],$a +#rc4# movz $TX[0]#b,$TX[0]#d +#md5# add $tmp,$a +#rc4# movl $TY#d,4*$k($XX[1]) +#rc4# add $TX[1]#b,$YY#b +#md5# rol \$$rot2[$j%4],$a +#md5# mov `$j==15?"\\\$-1":"$c"`,$tmp # forward reference +#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n +#md5# add $b,$a +___ + $code.=<<___ if ($rc4 && $j==15 && $k==$MOD-1); + mov $YY,$XX[1] + xor $YY,$YY # keyword to partial register + mov $XX[1]#b,$YY#b + lea ($dat,$XX[0],4),$XX[1] +___ + $code.=<<___ if ($rc4 && $j==15); + psllq \$8,%xmm1 + pxor %xmm0,%xmm4 + pxor %xmm1,%xmm4 +___ +} +sub R3 { + my ($i,$a,$b,$c,$d)=@_; + my @rot3=(6,10,15,21); + my $j=$i%16; + my $k=$i%$MOD; + my $xmm="%xmm".($j&1); + $code.=" movdqu 48($in0),%xmm5\n" if ($rc4 && $j==15); + $code.=" add \$$MOD,$XX[0]#b\n" if ($rc4 && $j==15 && $k==$MOD-1); + $code.=" pxor $xmm,$xmm\n" if ($rc4 && $j<=1); + $code.=<<___; +#rc4# movl ($dat,$YY,4),$TY#d +#md5# xor $d,$tmp +#rc4# movl $TX[0]#d,($dat,$YY,4) +#md5# or $b,$tmp +#md5# add 4*`((7*$j)%16)`($inp),$a +#rc4# add $TY#b,$TX[0]#b +#rc4# movl `4*(($k+1)%$MOD)`(`$k==$MOD-1?"$dat,$XX[0],4":"$XX[1]"`),$TX[1]#d +#md5# add \$$K[$i],$a +#rc4# movz $TX[0]#b,$TX[0]#d +#md5# xor $c,$tmp +#rc4# movl $TY#d,4*$k($XX[1]) +#md5# add $tmp,$a +#rc4# add $TX[1]#b,$YY#b +#md5# rol \$$rot3[$j%4],$a +#md5# mov \$-1,$tmp # forward reference +#rc4# pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n +#md5# add $b,$a +___ + $code.=<<___ if ($rc4 && $j==15); + mov $XX[0],$XX[1] + xor $XX[0],$XX[0] # keyword to partial register + mov $XX[1]#b,$XX[0]#b + mov $YY,$XX[1] + xor $YY,$YY # keyword to partial register + mov $XX[1]#b,$YY#b + lea ($dat,$XX[0],4),$XX[1] + psllq \$8,%xmm1 + pxor %xmm0,%xmm5 + pxor %xmm1,%xmm5 +___ +} + +my $i=0; +for(;$i<16;$i++) { R0($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } +for(;$i<32;$i++) { R1($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } +for(;$i<48;$i++) { R2($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } +for(;$i<64;$i++) { R3($i,@V); unshift(@V,pop(@V)); push(@TX,shift(@TX)); } + +$code.=<<___; +#md5# add 0*4(%rsp),$V[0] # accumulate hash value +#md5# add 1*4(%rsp),$V[1] +#md5# add 2*4(%rsp),$V[2] +#md5# add 3*4(%rsp),$V[3] + +#rc4# movdqu %xmm2,($out,$in0) # write RC4 output +#rc4# movdqu %xmm3,16($out,$in0) +#rc4# movdqu %xmm4,32($out,$in0) +#rc4# movdqu %xmm5,48($out,$in0) +#md5# lea 64($inp),$inp +#rc4# lea 64($in0),$in0 + cmp 16(%rsp),$inp # are we done? + jb .Loop + +#md5# mov 24(%rsp),$len # restore pointer to MD5_CTX +#rc4# sub $TX[0]#b,$YY#b # correct $YY +#md5# mov $V[0],0*4($len) # write MD5_CTX +#md5# mov $V[1],1*4($len) +#md5# mov $V[2],2*4($len) +#md5# mov $V[3],3*4($len) +___ +$code.=<<___ if ($rc4 && (!$md5 || $D)); + mov 32(%rsp),$len # restore original $len + and \$63,$len # remaining bytes + jnz .Loop1 + jmp .Ldone + +.align 16 +.Loop1: + add $TX[0]#b,$YY#b + movl ($dat,$YY,4),$TY#d + movl $TX[0]#d,($dat,$YY,4) + movl $TY#d,($dat,$XX[0],4) + add $TY#b,$TX[0]#b + inc $XX[0]#b + movl ($dat,$TX[0],4),$TY#d + movl ($dat,$XX[0],4),$TX[0]#d + xorb ($in0),$TY#b + movb $TY#b,($out,$in0) + lea 1($in0),$in0 + dec $len + jnz .Loop1 + +.Ldone: +___ +$code.=<<___; +#rc4# sub \$1,$XX[0]#b +#rc4# movl $XX[0]#d,-8($dat) +#rc4# movl $YY#d,-4($dat) + + mov 40(%rsp),%r15 + mov 48(%rsp),%r14 + mov 56(%rsp),%r13 + mov 64(%rsp),%r12 + mov 72(%rsp),%rbp + mov 80(%rsp),%rbx + lea 88(%rsp),%rsp +.Lepilogue: +.Labort: + ret +.size $func,.-$func +___ + +if ($rc4 && $D) { # sole purpose of this section is to provide + # option to use the generated module as drop-in + # replacement for rc4-x86_64.pl for debugging + # and testing purposes... +my ($idx,$ido)=("%r8","%r9"); +my ($dat,$len,$inp)=("%rdi","%rsi","%rdx"); + +$code.=<<___; +.globl RC4_set_key +.type RC4_set_key,\@function,3 +.align 16 +RC4_set_key: + lea 8($dat),$dat + lea ($inp,$len),$inp + neg $len + mov $len,%rcx + xor %eax,%eax + xor $ido,$ido + xor %r10,%r10 + xor %r11,%r11 + jmp .Lw1stloop + +.align 16 +.Lw1stloop: + mov %eax,($dat,%rax,4) + add \$1,%al + jnc .Lw1stloop + + xor $ido,$ido + xor $idx,$idx +.align 16 +.Lw2ndloop: + mov ($dat,$ido,4),%r10d + add ($inp,$len,1),$idx#b + add %r10b,$idx#b + add \$1,$len + mov ($dat,$idx,4),%r11d + cmovz %rcx,$len + mov %r10d,($dat,$idx,4) + mov %r11d,($dat,$ido,4) + add \$1,$ido#b + jnc .Lw2ndloop + + xor %eax,%eax + mov %eax,-8($dat) + mov %eax,-4($dat) + ret +.size RC4_set_key,.-RC4_set_key + +.globl RC4_options +.type RC4_options,\@abi-omnipotent +.align 16 +RC4_options: + lea .Lopts(%rip),%rax + ret +.align 64 +.Lopts: +.asciz "rc4(64x,int)" +.align 64 +.size RC4_options,.-RC4_options +___ +} +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +my $rec="%rcx"; +my $frame="%rdx"; +my $context="%r8"; +my $disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + lea .Lbody(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lbody + jb .Lin_prologue + + mov 152($context),%rax # pull context->Rsp + + lea .Lepilogue(%rip),%r10 + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lin_prologue + + mov 40(%rax),%r15 + mov 48(%rax),%r14 + mov 56(%rax),%r13 + mov 64(%rax),%r12 + mov 72(%rax),%rbp + mov 80(%rax),%rbx + lea 88(%rax),%rax + + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R12 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lin_prologue: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_$func + .rva .LSEH_end_$func + .rva .LSEH_info_$func + +.section .xdata +.align 8 +.LSEH_info_$func: + .byte 9,0,0,0 + .rva se_handler +___ +} + +sub reg_part { +my ($reg,$conv)=@_; + if ($reg =~ /%r[0-9]+/) { $reg .= $conv; } + elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; } + elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; } + elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; } + return $reg; +} + +$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem; +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/pinsrw\s+\$0,/movd /gm; + +$code =~ s/#md5#//gm if ($md5); +$code =~ s/#rc4#//gm if ($rc4); + +print $code; + +close STDOUT; diff --git a/src/lib/libcrypto/rc4/asm/rc4-parisc.pl b/src/lib/libcrypto/rc4/asm/rc4-parisc.pl new file mode 100644 index 0000000000..9165067080 --- /dev/null +++ b/src/lib/libcrypto/rc4/asm/rc4-parisc.pl @@ -0,0 +1,313 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# RC4 for PA-RISC. + +# June 2009. +# +# Performance is 33% better than gcc 3.2 generated code on PA-7100LC. +# For reference, [4x] unrolled loop is >40% faster than folded one. +# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement +# is believed to be not sufficient to justify the effort... +# +# Special thanks to polarhome.com for providing HP-UX account. + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; +} else { + $LEVEL ="1.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; +} + +$FRAME=4*$SIZE_T+$FRAME_MARKER; # 4 saved regs + frame marker + # [+ argument transfer] +$SZ=1; # defaults to RC4_CHAR +if (open CONF,"<${dir}../../opensslconf.h") { + while() { + if (m/#\s*define\s+RC4_INT\s+(.*)/) { + $SZ = ($1=~/char$/) ? 1 : 4; + last; + } + } + close CONF; +} + +if ($SZ==1) { # RC4_CHAR + $LD="ldb"; + $LDX="ldbx"; + $MKX="addl"; + $ST="stb"; +} else { # RC4_INT (~5% faster than RC4_CHAR on PA-7100LC) + $LD="ldw"; + $LDX="ldwx,s"; + $MKX="sh2addl"; + $ST="stw"; +} + +$key="%r26"; +$len="%r25"; +$inp="%r24"; +$out="%r23"; + +@XX=("%r19","%r20"); +@TX=("%r21","%r22"); +$YY="%r28"; +$TY="%r29"; + +$acc="%r1"; +$ix="%r2"; +$iy="%r3"; +$dat0="%r4"; +$dat1="%r5"; +$rem="%r6"; +$mask="%r31"; + +sub unrolledloopbody { +for ($i=0;$i<4;$i++) { +$code.=<<___; + ldo 1($XX[0]),$XX[1] + `sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)` + and $mask,$XX[1],$XX[1] + $LDX $YY($key),$TY + $MKX $YY,$key,$ix + $LDX $XX[1]($key),$TX[1] + $MKX $XX[0],$key,$iy + $ST $TX[0],0($ix) + comclr,<> $XX[1],$YY,%r0 ; conditional + copy $TX[0],$TX[1] ; move + `sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)` + $ST $TY,0($iy) + addl $TX[0],$TY,$TY + addl $TX[1],$YY,$YY + and $mask,$TY,$TY + and $mask,$YY,$YY +___ +push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers +} } + +sub foldedloop { +my ($label,$count)=@_; +$code.=<<___; +$label + $MKX $YY,$key,$iy + $LDX $YY($key),$TY + $MKX $XX[0],$key,$ix + $ST $TX[0],0($iy) + ldo 1($XX[0]),$XX[0] + $ST $TY,0($ix) + addl $TX[0],$TY,$TY + ldbx $inp($out),$dat1 + and $mask,$TY,$TY + and $mask,$XX[0],$XX[0] + $LDX $TY($key),$acc + $LDX $XX[0]($key),$TX[0] + ldo 1($out),$out + xor $dat1,$acc,$acc + addl $TX[0],$YY,$YY + stb $acc,-1($out) + addib,<> -1,$count,$label ; $count is always small + and $mask,$YY,$YY +___ +} + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR +RC4 + .PROC + .CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + + cmpib,*= 0,$len,L\$abort + sub $inp,$out,$inp ; distance between $inp and $out + + $LD `0*$SZ`($key),$XX[0] + $LD `1*$SZ`($key),$YY + ldo `2*$SZ`($key),$key + + ldi 0xff,$mask + ldi 3,$dat0 + + ldo 1($XX[0]),$XX[0] ; warm up loop + and $mask,$XX[0],$XX[0] + $LDX $XX[0]($key),$TX[0] + addl $TX[0],$YY,$YY + cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother? + and $mask,$YY,$YY + + and,<> $out,$dat0,$rem ; is $out aligned? + b L\$alignedout + subi 4,$rem,$rem + sub $len,$rem,$len +___ +&foldedloop("L\$alignout",$rem); # process till $out is aligned + +$code.=<<___; +L\$alignedout ; $len is at least 4 here + and,<> $inp,$dat0,$acc ; is $inp aligned? + b L\$oop4 + sub $inp,$acc,$rem ; align $inp + + sh3addl $acc,%r0,$acc + subi 32,$acc,$acc + mtctl $acc,%cr11 ; load %sar with vshd align factor + ldwx $rem($out),$dat0 + ldo 4($rem),$rem +L\$oop4misalignedinp +___ +&unrolledloopbody(); +$code.=<<___; + $LDX $TY($key),$ix + ldwx $rem($out),$dat1 + ldo -4($len),$len + or $ix,$acc,$acc ; last piece, no need to dep + vshd $dat0,$dat1,$iy ; align data + copy $dat1,$dat0 + xor $iy,$acc,$acc + stw $acc,0($out) + cmpib,*<< 3,$len,L\$oop4misalignedinp + ldo 4($out),$out + cmpib,*= 0,$len,L\$done + nop + b L\$oop1 + nop + + .ALIGN 8 +L\$oop4 +___ +&unrolledloopbody(); +$code.=<<___; + $LDX $TY($key),$ix + ldwx $inp($out),$dat0 + ldo -4($len),$len + or $ix,$acc,$acc ; last piece, no need to dep + xor $dat0,$acc,$acc + stw $acc,0($out) + cmpib,*<< 3,$len,L\$oop4 + ldo 4($out),$out + cmpib,*= 0,$len,L\$done + nop +___ +&foldedloop("L\$oop1",$len); +$code.=<<___; +L\$done + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 + ldo -1($XX[0]),$XX[0] ; chill out loop + sub $YY,$TX[0],$YY + and $mask,$XX[0],$XX[0] + and $mask,$YY,$YY + $ST $XX[0],`-2*$SZ`($key) + $ST $YY,`-1*$SZ`($key) + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 +L\$abort + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND +___ + +$code.=<<___; + + .EXPORT private_RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR + .ALIGN 8 +private_RC4_set_key + .PROC + .CALLINFO NO_CALLS + .ENTRY + $ST %r0,`0*$SZ`($key) + $ST %r0,`1*$SZ`($key) + ldo `2*$SZ`($key),$key + copy %r0,@XX[0] +L\$1st + $ST @XX[0],0($key) + ldo 1(@XX[0]),@XX[0] + bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256 + ldo $SZ($key),$key + + ldo `-256*$SZ`($key),$key ; rewind $key + addl $len,$inp,$inp ; $inp to point at the end + sub %r0,$len,%r23 ; inverse index + copy %r0,@XX[0] + copy %r0,@XX[1] + ldi 0xff,$mask + +L\$2nd + $LDX @XX[0]($key),@TX[0] + ldbx %r23($inp),@TX[1] + addi,nuv 1,%r23,%r23 ; increment and conditional + sub %r0,$len,%r23 ; inverse index + addl @TX[0],@XX[1],@XX[1] + addl @TX[1],@XX[1],@XX[1] + and $mask,@XX[1],@XX[1] + $MKX @XX[0],$key,$TY + $LDX @XX[1]($key),@TX[1] + $MKX @XX[1],$key,$YY + ldo 1(@XX[0]),@XX[0] + $ST @TX[0],0($YY) + bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256 + $ST @TX[1],0($TY) + + bv,n (%r2) + .EXIT + nop + .PROCEND + + .EXPORT RC4_options,ENTRY + .ALIGN 8 +RC4_options + .PROC + .CALLINFO NO_CALLS + .ENTRY + blr %r0,%r28 + ldi 3,%r1 +L\$pic + andcm %r28,%r1,%r28 + bv (%r2) + .EXIT + ldo L\$opts-L\$pic(%r28),%r28 + .PROCEND + .ALIGN 8 +L\$opts + .STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)" + .STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by " +___ +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4); + +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl index 96681fa05e..7528ece13c 100644 --- a/src/lib/libcrypto/rc4/asm/rc4-s390x.pl +++ b/src/lib/libcrypto/rc4/asm/rc4-s390x.pl @@ -13,6 +13,29 @@ # "cluster" Address Generation Interlocks, so that one pipeline stall # resolves several dependencies. +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z990 it was measured to perform +# 50% better than code generated by gcc 4.3. + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} +open STDOUT,">$output"; + $rp="%r14"; $sp="%r15"; $code=<<___; @@ -39,7 +62,12 @@ $code.=<<___; .type RC4,\@function .align 64 RC4: - stmg %r6,%r11,48($sp) + stm${g} %r6,%r11,6*$SIZE_T($sp) +___ +$code.=<<___ if ($flavour =~ /3[12]/); + llgfr $len,$len +___ +$code.=<<___; llgc $XX[0],0($key) llgc $YY,1($key) la $XX[0],1($XX[0]) @@ -90,7 +118,7 @@ $code.=<<___; xgr $acc,$TX[1] stg $acc,0($out) la $out,8($out) - brct $cnt,.Loop8 + brctg $cnt,.Loop8 .Lshort: lghi $acc,7 @@ -122,7 +150,7 @@ $code.=<<___; ahi $XX[0],-1 stc $XX[0],0($key) stc $YY,1($key) - lmg %r6,%r11,48($sp) + lm${g} %r6,%r11,6*$SIZE_T($sp) br $rp .size RC4,.-RC4 .string "RC4 for s390x, CRYPTOGAMS by " @@ -143,11 +171,11 @@ $ikey="%r7"; $iinp="%r8"; $code.=<<___; -.globl RC4_set_key -.type RC4_set_key,\@function +.globl private_RC4_set_key +.type private_RC4_set_key,\@function .align 64 -RC4_set_key: - stmg %r6,%r8,48($sp) +private_RC4_set_key: + stm${g} %r6,%r8,6*$SIZE_T($sp) lhi $cnt,256 la $idx,0(%r0) sth $idx,0($key) @@ -180,9 +208,9 @@ RC4_set_key: la $iinp,0(%r0) j .L2ndloop .Ldone: - lmg %r6,%r8,48($sp) + lm${g} %r6,%r8,6*$SIZE_T($sp) br $rp -.size RC4_set_key,.-RC4_set_key +.size private_RC4_set_key,.-private_RC4_set_key ___ } @@ -203,3 +231,4 @@ RC4_options: ___ print $code; +close STDOUT; # force flush diff --git a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl index 677be5fe25..d6eac205e9 100755 --- a/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl +++ b/src/lib/libcrypto/rc4/asm/rc4-x86_64.pl @@ -7,6 +7,8 @@ # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # +# July 2004 +# # 2.22x RC4 tune-up:-) It should be noted though that my hand [as in # "hand-coded assembler"] doesn't stand for the whole improvement # coefficient. It turned out that eliminating RC4_CHAR from config @@ -19,6 +21,8 @@ # to operate on partial registers, it turned out to be the best bet. # At least for AMD... How IA32E would perform remains to be seen... +# November 2004 +# # As was shown by Marc Bevand reordering of couple of load operations # results in even higher performance gain of 3.3x:-) At least on # Opteron... For reference, 1x in this case is RC4_CHAR C-code @@ -26,6 +30,8 @@ # Latter means that if you want to *estimate* what to expect from # *your* Opteron, then multiply 54 by 3.3 and clock frequency in GHz. +# November 2004 +# # Intel P4 EM64T core was found to run the AMD64 code really slow... # The only way to achieve comparable performance on P4 was to keep # RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to @@ -33,10 +39,14 @@ # on either AMD and Intel platforms, I implement both cases. See # rc4_skey.c for further details... +# April 2005 +# # P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing # those with add/sub results in 50% performance improvement of folded # loop... +# May 2005 +# # As was shown by Zou Nanhai loop unrolling can improve Intel EM64T # performance by >30% [unlike P4 32-bit case that is]. But this is # provided that loads are reordered even more aggressively! Both code @@ -50,6 +60,8 @@ # is not implemented, then this final RC4_CHAR code-path should be # preferred, as it provides better *all-round* performance]. +# March 2007 +# # Intel Core2 was observed to perform poorly on both code paths:-( It # apparently suffers from some kind of partial register stall, which # occurs in 64-bit mode only [as virtually identical 32-bit loop was @@ -58,6 +70,37 @@ # fit for Core2 and therefore the code was modified to skip cloop8 on # this CPU. +# May 2010 +# +# Intel Westmere was observed to perform suboptimally. Adding yet +# another movzb to cloop1 improved performance by almost 50%! Core2 +# performance is improved too, but nominally... + +# May 2011 +# +# The only code path that was not modified is P4-specific one. Non-P4 +# Intel code path optimization is heavily based on submission by Maxim +# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used +# some of the ideas even in attempt to optmize the original RC4_INT +# code path... Current performance in cycles per processed byte (less +# is better) and improvement coefficients relative to previous +# version of this module are: +# +# Opteron 5.3/+0%(*) +# P4 6.5 +# Core2 6.2/+15%(**) +# Westmere 4.2/+60% +# Sandy Bridge 4.2/+120% +# Atom 9.3/+80% +# +# (*) But corresponding loop has less instructions, which should have +# positive effect on upcoming Bulldozer, which has one less ALU. +# For reference, Intel code runs at 6.8 cpb rate on Opteron. +# (**) Note that Core2 result is ~15% lower than corresponding result +# for 32-bit code, meaning that it's possible to improve it, +# but more than likely at the cost of the others (see rc4-586.pl +# to get the idea)... + $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } @@ -76,13 +119,10 @@ $len="%rsi"; # arg2 $inp="%rdx"; # arg3 $out="%rcx"; # arg4 -@XX=("%r8","%r10"); -@TX=("%r9","%r11"); -$YY="%r12"; -$TY="%r13"; - +{ $code=<<___; .text +.extern OPENSSL_ia32cap_P .globl RC4 .type RC4,\@function,4 @@ -95,48 +135,173 @@ RC4: or $len,$len push %r12 push %r13 .Lprologue: + mov $len,%r11 + mov $inp,%r12 + mov $out,%r13 +___ +my $len="%r11"; # reassign input arguments +my $inp="%r12"; +my $out="%r13"; - add \$8,$dat - movl -8($dat),$XX[0]#d - movl -4($dat),$YY#d +my @XX=("%r10","%rsi"); +my @TX=("%rax","%rbx"); +my $YY="%rcx"; +my $TY="%rdx"; + +$code.=<<___; + xor $XX[0],$XX[0] + xor $YY,$YY + + lea 8($dat),$dat + mov -8($dat),$XX[0]#b + mov -4($dat),$YY#b cmpl \$-1,256($dat) je .LRC4_CHAR + mov OPENSSL_ia32cap_P(%rip),%r8d + xor $TX[1],$TX[1] inc $XX[0]#b + sub $XX[0],$TX[1] + sub $inp,$out movl ($dat,$XX[0],4),$TX[0]#d - test \$-8,$len + test \$-16,$len jz .Lloop1 - jmp .Lloop8 + bt \$30,%r8d # Intel CPU? + jc .Lintel + and \$7,$TX[1] + lea 1($XX[0]),$XX[1] + jz .Loop8 + sub $TX[1],$len +.Loop8_warmup: + add $TX[0]#b,$YY#b + movl ($dat,$YY,4),$TY#d + movl $TX[0]#d,($dat,$YY,4) + movl $TY#d,($dat,$XX[0],4) + add $TY#b,$TX[0]#b + inc $XX[0]#b + movl ($dat,$TX[0],4),$TY#d + movl ($dat,$XX[0],4),$TX[0]#d + xorb ($inp),$TY#b + movb $TY#b,($out,$inp) + lea 1($inp),$inp + dec $TX[1] + jnz .Loop8_warmup + + lea 1($XX[0]),$XX[1] + jmp .Loop8 .align 16 -.Lloop8: +.Loop8: ___ for ($i=0;$i<8;$i++) { +$code.=<<___ if ($i==7); + add \$8,$XX[1]#b +___ $code.=<<___; add $TX[0]#b,$YY#b - mov $XX[0],$XX[1] movl ($dat,$YY,4),$TY#d - ror \$8,%rax # ror is redundant when $i=0 - inc $XX[1]#b - movl ($dat,$XX[1],4),$TX[1]#d - cmp $XX[1],$YY movl $TX[0]#d,($dat,$YY,4) - cmove $TX[0],$TX[1] - movl $TY#d,($dat,$XX[0],4) + movl `4*($i==7?-1:$i)`($dat,$XX[1],4),$TX[1]#d + ror \$8,%r8 # ror is redundant when $i=0 + movl $TY#d,4*$i($dat,$XX[0],4) add $TX[0]#b,$TY#b - movb ($dat,$TY,4),%al + movb ($dat,$TY,4),%r8b ___ -push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers +push(@TX,shift(@TX)); #push(@XX,shift(@XX)); # "rotate" registers } $code.=<<___; - ror \$8,%rax + add \$8,$XX[0]#b + ror \$8,%r8 sub \$8,$len - xor ($inp),%rax - add \$8,$inp - mov %rax,($out) - add \$8,$out + xor ($inp),%r8 + mov %r8,($out,$inp) + lea 8($inp),$inp test \$-8,$len - jnz .Lloop8 + jnz .Loop8 + cmp \$0,$len + jne .Lloop1 + jmp .Lexit + +.align 16 +.Lintel: + test \$-32,$len + jz .Lloop1 + and \$15,$TX[1] + jz .Loop16_is_hot + sub $TX[1],$len +.Loop16_warmup: + add $TX[0]#b,$YY#b + movl ($dat,$YY,4),$TY#d + movl $TX[0]#d,($dat,$YY,4) + movl $TY#d,($dat,$XX[0],4) + add $TY#b,$TX[0]#b + inc $XX[0]#b + movl ($dat,$TX[0],4),$TY#d + movl ($dat,$XX[0],4),$TX[0]#d + xorb ($inp),$TY#b + movb $TY#b,($out,$inp) + lea 1($inp),$inp + dec $TX[1] + jnz .Loop16_warmup + + mov $YY,$TX[1] + xor $YY,$YY + mov $TX[1]#b,$YY#b + +.Loop16_is_hot: + lea ($dat,$XX[0],4),$XX[1] +___ +sub RC4_loop { + my $i=shift; + my $j=$i<0?0:$i; + my $xmm="%xmm".($j&1); + + $code.=" add \$16,$XX[0]#b\n" if ($i==15); + $code.=" movdqu ($inp),%xmm2\n" if ($i==15); + $code.=" add $TX[0]#b,$YY#b\n" if ($i<=0); + $code.=" movl ($dat,$YY,4),$TY#d\n"; + $code.=" pxor %xmm0,%xmm2\n" if ($i==0); + $code.=" psllq \$8,%xmm1\n" if ($i==0); + $code.=" pxor $xmm,$xmm\n" if ($i<=1); + $code.=" movl $TX[0]#d,($dat,$YY,4)\n"; + $code.=" add $TY#b,$TX[0]#b\n"; + $code.=" movl `4*($j+1)`($XX[1]),$TX[1]#d\n" if ($i<15); + $code.=" movz $TX[0]#b,$TX[0]#d\n"; + $code.=" movl $TY#d,4*$j($XX[1])\n"; + $code.=" pxor %xmm1,%xmm2\n" if ($i==0); + $code.=" lea ($dat,$XX[0],4),$XX[1]\n" if ($i==15); + $code.=" add $TX[1]#b,$YY#b\n" if ($i<15); + $code.=" pinsrw \$`($j>>1)&7`,($dat,$TX[0],4),$xmm\n"; + $code.=" movdqu %xmm2,($out,$inp)\n" if ($i==0); + $code.=" lea 16($inp),$inp\n" if ($i==0); + $code.=" movl ($XX[1]),$TX[1]#d\n" if ($i==15); +} + RC4_loop(-1); +$code.=<<___; + jmp .Loop16_enter +.align 16 +.Loop16: +___ + +for ($i=0;$i<16;$i++) { + $code.=".Loop16_enter:\n" if ($i==1); + RC4_loop($i); + push(@TX,shift(@TX)); # "rotate" registers +} +$code.=<<___; + mov $YY,$TX[1] + xor $YY,$YY # keyword to partial register + sub \$16,$len + mov $TX[1]#b,$YY#b + test \$-16,$len + jnz .Loop16 + + psllq \$8,%xmm1 + pxor %xmm0,%xmm2 + pxor %xmm1,%xmm2 + movdqu %xmm2,($out,$inp) + lea 16($inp),$inp + cmp \$0,$len jne .Lloop1 jmp .Lexit @@ -152,9 +317,8 @@ $code.=<<___; movl ($dat,$TX[0],4),$TY#d movl ($dat,$XX[0],4),$TX[0]#d xorb ($inp),$TY#b - inc $inp - movb $TY#b,($out) - inc $out + movb $TY#b,($out,$inp) + lea 1($inp),$inp dec $len jnz .Lloop1 jmp .Lexit @@ -165,13 +329,11 @@ $code.=<<___; movzb ($dat,$XX[0]),$TX[0]#d test \$-8,$len jz .Lcloop1 - cmpl \$0,260($dat) - jnz .Lcloop1 jmp .Lcloop8 .align 16 .Lcloop8: - mov ($inp),%eax - mov 4($inp),%ebx + mov ($inp),%r8d + mov 4($inp),%r9d ___ # unroll 2x4-wise, because 64-bit rotates kill Intel P4... for ($i=0;$i<4;$i++) { @@ -188,8 +350,8 @@ $code.=<<___; mov $TX[0],$TX[1] .Lcmov$i: add $TX[0]#b,$TY#b - xor ($dat,$TY),%al - ror \$8,%eax + xor ($dat,$TY),%r8b + ror \$8,%r8d ___ push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers } @@ -207,16 +369,16 @@ $code.=<<___; mov $TX[0],$TX[1] .Lcmov$i: add $TX[0]#b,$TY#b - xor ($dat,$TY),%bl - ror \$8,%ebx + xor ($dat,$TY),%r9b + ror \$8,%r9d ___ push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers } $code.=<<___; lea -8($len),$len - mov %eax,($out) + mov %r8d,($out) lea 8($inp),$inp - mov %ebx,4($out) + mov %r9d,4($out) lea 8($out),$out test \$-8,$len @@ -229,6 +391,7 @@ $code.=<<___; .align 16 .Lcloop1: add $TX[0]#b,$YY#b + movzb $YY#b,$YY#d movzb ($dat,$YY),$TY#d movb $TX[0]#b,($dat,$YY) movb $TY#b,($dat,$XX[0]) @@ -260,16 +423,16 @@ $code.=<<___; ret .size RC4,.-RC4 ___ +} $idx="%r8"; $ido="%r9"; $code.=<<___; -.extern OPENSSL_ia32cap_P -.globl RC4_set_key -.type RC4_set_key,\@function,3 +.globl private_RC4_set_key +.type private_RC4_set_key,\@function,3 .align 16 -RC4_set_key: +private_RC4_set_key: lea 8($dat),$dat lea ($inp,$len),$inp neg $len @@ -280,12 +443,9 @@ RC4_set_key: xor %r11,%r11 mov OPENSSL_ia32cap_P(%rip),$idx#d - bt \$20,$idx#d - jnc .Lw1stloop - bt \$30,$idx#d - setc $ido#b - mov $ido#d,260($dat) - jmp .Lc1stloop + bt \$20,$idx#d # RC4_CHAR? + jc .Lc1stloop + jmp .Lw1stloop .align 16 .Lw1stloop: @@ -339,7 +499,7 @@ RC4_set_key: mov %eax,-8($dat) mov %eax,-4($dat) ret -.size RC4_set_key,.-RC4_set_key +.size private_RC4_set_key,.-private_RC4_set_key .globl RC4_options .type RC4_options,\@abi-omnipotent @@ -348,18 +508,20 @@ RC4_options: lea .Lopts(%rip),%rax mov OPENSSL_ia32cap_P(%rip),%edx bt \$20,%edx - jnc .Ldone - add \$12,%rax + jc .L8xchar bt \$30,%edx jnc .Ldone - add \$13,%rax + add \$25,%rax + ret +.L8xchar: + add \$12,%rax .Ldone: ret .align 64 .Lopts: .asciz "rc4(8x,int)" .asciz "rc4(8x,char)" -.asciz "rc4(1x,char)" +.asciz "rc4(16x,int)" .asciz "RC4 for x86_64, CRYPTOGAMS by " .align 64 .size RC4_options,.-RC4_options @@ -482,22 +644,32 @@ key_se_handler: .rva .LSEH_end_RC4 .rva .LSEH_info_RC4 - .rva .LSEH_begin_RC4_set_key - .rva .LSEH_end_RC4_set_key - .rva .LSEH_info_RC4_set_key + .rva .LSEH_begin_private_RC4_set_key + .rva .LSEH_end_private_RC4_set_key + .rva .LSEH_info_private_RC4_set_key .section .xdata .align 8 .LSEH_info_RC4: .byte 9,0,0,0 .rva stream_se_handler -.LSEH_info_RC4_set_key: +.LSEH_info_private_RC4_set_key: .byte 9,0,0,0 .rva key_se_handler ___ } -$code =~ s/#([bwd])/$1/gm; +sub reg_part { +my ($reg,$conv)=@_; + if ($reg =~ /%r[0-9]+/) { $reg .= $conv; } + elsif ($conv eq "b") { $reg =~ s/%[er]([^x]+)x?/%$1l/; } + elsif ($conv eq "w") { $reg =~ s/%[er](.+)/%$1/; } + elsif ($conv eq "d") { $reg =~ s/%[er](.+)/%e$1/; } + return $reg; +} + +$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem; +$code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; diff --git a/src/lib/libcrypto/rc4/rc4.h b/src/lib/libcrypto/rc4/rc4.h index 29d1acccf5..88ceb46bc5 100644 --- a/src/lib/libcrypto/rc4/rc4.h +++ b/src/lib/libcrypto/rc4/rc4.h @@ -79,6 +79,7 @@ typedef struct rc4_key_st const char *RC4_options(void); void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data); +void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data); void RC4(RC4_KEY *key, size_t len, const unsigned char *indata, unsigned char *outdata); diff --git a/src/lib/libcrypto/rc4/rc4_skey.c b/src/lib/libcrypto/rc4/rc4_skey.c index b22c40b0bd..fda27636e7 100644 --- a/src/lib/libcrypto/rc4/rc4_skey.c +++ b/src/lib/libcrypto/rc4/rc4_skey.c @@ -85,7 +85,7 @@ const char *RC4_options(void) * Date: Wed, 14 Sep 1994 06:35:31 GMT */ -void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data) +void private_RC4_set_key(RC4_KEY *key, int len, const unsigned char *data) { register RC4_INT tmp; register int id1,id2; @@ -104,40 +104,6 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data) d[(n)]=d[id2]; \ d[id2]=tmp; } -#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM) -# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ - defined(__INTEL__) || \ - defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) - if (sizeof(RC4_INT) > 1) { - /* - * Unlike all other x86 [and x86_64] implementations, - * Intel P4 core [including EM64T] was found to perform - * poorly with wider RC4_INT. Performance improvement - * for IA-32 hand-coded assembler turned out to be 2.8x - * if re-coded for RC4_CHAR! It's however inappropriate - * to just switch to RC4_CHAR for x86[_64], as non-P4 - * implementations suffer from significant performance - * losses then, e.g. PIII exhibits >2x deterioration, - * and so does Opteron. In order to assure optimal - * all-round performance, let us [try to] detect P4 at - * run-time by checking upon HTT bit in CPU capability - * vector and set up compressed key schedule, which is - * recognized by correspondingly updated assembler - * module... - * - */ - if (OPENSSL_ia32cap_P & (1<<28)) { - unsigned char *cp=(unsigned char *)d; - - for (i=0;i<256;i++) cp[i]=i; - for (i=0;i<256;i++) SK_LOOP(cp,i); - /* mark schedule as compressed! */ - d[256/sizeof(RC4_INT)]=-1; - return; - } - } -# endif -#endif for (i=0; i < 256; i++) d[i]=i; for (i=0; i < 256; i+=4) { diff --git a/src/lib/libcrypto/ripemd/ripemd.h b/src/lib/libcrypto/ripemd/ripemd.h index 5942eb6180..189bd8c90e 100644 --- a/src/lib/libcrypto/ripemd/ripemd.h +++ b/src/lib/libcrypto/ripemd/ripemd.h @@ -91,6 +91,9 @@ typedef struct RIPEMD160state_st unsigned int num; } RIPEMD160_CTX; +#ifdef OPENSSL_FIPS +int private_RIPEMD160_Init(RIPEMD160_CTX *c); +#endif int RIPEMD160_Init(RIPEMD160_CTX *c); int RIPEMD160_Update(RIPEMD160_CTX *c, const void *data, size_t len); int RIPEMD160_Final(unsigned char *md, RIPEMD160_CTX *c); diff --git a/src/lib/libcrypto/ripemd/rmd_dgst.c b/src/lib/libcrypto/ripemd/rmd_dgst.c index 59b017f8c0..63f0d983f7 100644 --- a/src/lib/libcrypto/ripemd/rmd_dgst.c +++ b/src/lib/libcrypto/ripemd/rmd_dgst.c @@ -59,6 +59,7 @@ #include #include "rmd_locl.h" #include +#include const char RMD160_version[]="RIPE-MD160" OPENSSL_VERSION_PTEXT; @@ -69,7 +70,7 @@ const char RMD160_version[]="RIPE-MD160" OPENSSL_VERSION_PTEXT; void ripemd160_block(RIPEMD160_CTX *c, unsigned long *p,size_t num); # endif -int RIPEMD160_Init(RIPEMD160_CTX *c) +fips_md_init(RIPEMD160) { memset (c,0,sizeof(*c)); c->A=RIPEMD160_A; diff --git a/src/lib/libcrypto/rsa/rsa.h b/src/lib/libcrypto/rsa/rsa.h index cf74343657..4814a2fc15 100644 --- a/src/lib/libcrypto/rsa/rsa.h +++ b/src/lib/libcrypto/rsa/rsa.h @@ -222,12 +222,22 @@ struct rsa_st EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, -1, EVP_PKEY_CTRL_RSA_PADDING, \ pad, NULL) +#define EVP_PKEY_CTX_get_rsa_padding(ctx, ppad) \ + EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, -1, \ + EVP_PKEY_CTRL_GET_RSA_PADDING, 0, ppad) + #define EVP_PKEY_CTX_set_rsa_pss_saltlen(ctx, len) \ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, \ (EVP_PKEY_OP_SIGN|EVP_PKEY_OP_VERIFY), \ EVP_PKEY_CTRL_RSA_PSS_SALTLEN, \ len, NULL) +#define EVP_PKEY_CTX_get_rsa_pss_saltlen(ctx, plen) \ + EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, \ + (EVP_PKEY_OP_SIGN|EVP_PKEY_OP_VERIFY), \ + EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN, \ + 0, plen) + #define EVP_PKEY_CTX_set_rsa_keygen_bits(ctx, bits) \ EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_KEYGEN, \ EVP_PKEY_CTRL_RSA_KEYGEN_BITS, bits, NULL) @@ -236,11 +246,24 @@ struct rsa_st EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_KEYGEN, \ EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP, 0, pubexp) +#define EVP_PKEY_CTX_set_rsa_mgf1_md(ctx, md) \ + EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_SIG, \ + EVP_PKEY_CTRL_RSA_MGF1_MD, 0, (void *)md) + +#define EVP_PKEY_CTX_get_rsa_mgf1_md(ctx, pmd) \ + EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_SIG, \ + EVP_PKEY_CTRL_GET_RSA_MGF1_MD, 0, (void *)pmd) + #define EVP_PKEY_CTRL_RSA_PADDING (EVP_PKEY_ALG_CTRL + 1) #define EVP_PKEY_CTRL_RSA_PSS_SALTLEN (EVP_PKEY_ALG_CTRL + 2) #define EVP_PKEY_CTRL_RSA_KEYGEN_BITS (EVP_PKEY_ALG_CTRL + 3) #define EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP (EVP_PKEY_ALG_CTRL + 4) +#define EVP_PKEY_CTRL_RSA_MGF1_MD (EVP_PKEY_ALG_CTRL + 5) + +#define EVP_PKEY_CTRL_GET_RSA_PADDING (EVP_PKEY_ALG_CTRL + 6) +#define EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN (EVP_PKEY_ALG_CTRL + 7) +#define EVP_PKEY_CTRL_GET_RSA_MGF1_MD (EVP_PKEY_ALG_CTRL + 8) #define RSA_PKCS1_PADDING 1 #define RSA_SSLV23_PADDING 2 @@ -300,6 +323,16 @@ const RSA_METHOD *RSA_null_method(void); DECLARE_ASN1_ENCODE_FUNCTIONS_const(RSA, RSAPublicKey) DECLARE_ASN1_ENCODE_FUNCTIONS_const(RSA, RSAPrivateKey) +typedef struct rsa_pss_params_st + { + X509_ALGOR *hashAlgorithm; + X509_ALGOR *maskGenAlgorithm; + ASN1_INTEGER *saltLength; + ASN1_INTEGER *trailerField; + } RSA_PSS_PARAMS; + +DECLARE_ASN1_FUNCTIONS(RSA_PSS_PARAMS) + #ifndef OPENSSL_NO_FP_API int RSA_print_fp(FILE *fp, const RSA *r,int offset); #endif @@ -380,6 +413,14 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, const unsigned char *mHash, const EVP_MD *Hash, int sLen); +int RSA_verify_PKCS1_PSS_mgf1(RSA *rsa, const unsigned char *mHash, + const EVP_MD *Hash, const EVP_MD *mgf1Hash, + const unsigned char *EM, int sLen); + +int RSA_padding_add_PKCS1_PSS_mgf1(RSA *rsa, unsigned char *EM, + const unsigned char *mHash, + const EVP_MD *Hash, const EVP_MD *mgf1Hash, int sLen); + int RSA_get_ex_new_index(long argl, void *argp, CRYPTO_EX_new *new_func, CRYPTO_EX_dup *dup_func, CRYPTO_EX_free *free_func); int RSA_set_ex_data(RSA *r,int idx,void *arg); @@ -388,6 +429,25 @@ void *RSA_get_ex_data(const RSA *r, int idx); RSA *RSAPublicKey_dup(RSA *rsa); RSA *RSAPrivateKey_dup(RSA *rsa); +/* If this flag is set the RSA method is FIPS compliant and can be used + * in FIPS mode. This is set in the validated module method. If an + * application sets this flag in its own methods it is its responsibility + * to ensure the result is compliant. + */ + +#define RSA_FLAG_FIPS_METHOD 0x0400 + +/* If this flag is set the operations normally disabled in FIPS mode are + * permitted it is then the applications responsibility to ensure that the + * usage is compliant. + */ + +#define RSA_FLAG_NON_FIPS_ALLOW 0x0400 +/* Application has decided PRNG is good enough to generate a key: don't + * check. + */ +#define RSA_FLAG_CHECKED 0x0800 + /* BEGIN ERROR CODES */ /* The following lines are auto generated by the script mkerr.pl. Any changes * made after this point may be overwritten when the script is next run. @@ -405,6 +465,7 @@ void ERR_load_RSA_strings(void); #define RSA_F_PKEY_RSA_CTRL 143 #define RSA_F_PKEY_RSA_CTRL_STR 144 #define RSA_F_PKEY_RSA_SIGN 142 +#define RSA_F_PKEY_RSA_VERIFY 154 #define RSA_F_PKEY_RSA_VERIFYRECOVER 141 #define RSA_F_RSA_BUILTIN_KEYGEN 129 #define RSA_F_RSA_CHECK_KEY 123 @@ -413,6 +474,8 @@ void ERR_load_RSA_strings(void); #define RSA_F_RSA_EAY_PUBLIC_DECRYPT 103 #define RSA_F_RSA_EAY_PUBLIC_ENCRYPT 104 #define RSA_F_RSA_GENERATE_KEY 105 +#define RSA_F_RSA_GENERATE_KEY_EX 155 +#define RSA_F_RSA_ITEM_VERIFY 156 #define RSA_F_RSA_MEMORY_LOCK 130 #define RSA_F_RSA_NEW_METHOD 106 #define RSA_F_RSA_NULL 124 @@ -424,6 +487,7 @@ void ERR_load_RSA_strings(void); #define RSA_F_RSA_PADDING_ADD_NONE 107 #define RSA_F_RSA_PADDING_ADD_PKCS1_OAEP 121 #define RSA_F_RSA_PADDING_ADD_PKCS1_PSS 125 +#define RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1 148 #define RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_1 108 #define RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_2 109 #define RSA_F_RSA_PADDING_ADD_SSLV23 110 @@ -436,8 +500,12 @@ void ERR_load_RSA_strings(void); #define RSA_F_RSA_PADDING_CHECK_X931 128 #define RSA_F_RSA_PRINT 115 #define RSA_F_RSA_PRINT_FP 116 +#define RSA_F_RSA_PRIVATE_DECRYPT 150 +#define RSA_F_RSA_PRIVATE_ENCRYPT 151 #define RSA_F_RSA_PRIV_DECODE 137 #define RSA_F_RSA_PRIV_ENCODE 138 +#define RSA_F_RSA_PUBLIC_DECRYPT 152 +#define RSA_F_RSA_PUBLIC_ENCRYPT 153 #define RSA_F_RSA_PUB_DECODE 139 #define RSA_F_RSA_SETUP_BLINDING 136 #define RSA_F_RSA_SIGN 117 @@ -445,6 +513,7 @@ void ERR_load_RSA_strings(void); #define RSA_F_RSA_VERIFY 119 #define RSA_F_RSA_VERIFY_ASN1_OCTET_STRING 120 #define RSA_F_RSA_VERIFY_PKCS1_PSS 126 +#define RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1 149 /* Reason codes. */ #define RSA_R_ALGORITHM_MISMATCH 100 @@ -470,19 +539,24 @@ void ERR_load_RSA_strings(void); #define RSA_R_INVALID_HEADER 137 #define RSA_R_INVALID_KEYBITS 145 #define RSA_R_INVALID_MESSAGE_LENGTH 131 +#define RSA_R_INVALID_MGF1_MD 156 #define RSA_R_INVALID_PADDING 138 #define RSA_R_INVALID_PADDING_MODE 141 +#define RSA_R_INVALID_PSS_PARAMETERS 149 #define RSA_R_INVALID_PSS_SALTLEN 146 +#define RSA_R_INVALID_SALT_LENGTH 150 #define RSA_R_INVALID_TRAILER 139 #define RSA_R_INVALID_X931_DIGEST 142 #define RSA_R_IQMP_NOT_INVERSE_OF_Q 126 #define RSA_R_KEY_SIZE_TOO_SMALL 120 #define RSA_R_LAST_OCTET_INVALID 134 #define RSA_R_MODULUS_TOO_LARGE 105 +#define RSA_R_NON_FIPS_RSA_METHOD 157 #define RSA_R_NO_PUBLIC_EXPONENT 140 #define RSA_R_NULL_BEFORE_BLOCK_MISSING 113 #define RSA_R_N_DOES_NOT_EQUAL_P_Q 127 #define RSA_R_OAEP_DECODING_ERROR 121 +#define RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE 158 #define RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE 148 #define RSA_R_PADDING_CHECK_FAILED 114 #define RSA_R_P_NOT_PRIME 128 @@ -493,7 +567,12 @@ void ERR_load_RSA_strings(void); #define RSA_R_SSLV3_ROLLBACK_ATTACK 115 #define RSA_R_THE_ASN1_OBJECT_IDENTIFIER_IS_NOT_KNOWN_FOR_THIS_MD 116 #define RSA_R_UNKNOWN_ALGORITHM_TYPE 117 +#define RSA_R_UNKNOWN_MASK_DIGEST 151 #define RSA_R_UNKNOWN_PADDING_TYPE 118 +#define RSA_R_UNKNOWN_PSS_DIGEST 152 +#define RSA_R_UNSUPPORTED_MASK_ALGORITHM 153 +#define RSA_R_UNSUPPORTED_MASK_PARAMETER 154 +#define RSA_R_UNSUPPORTED_SIGNATURE_TYPE 155 #define RSA_R_VALUE_MISSING 147 #define RSA_R_WRONG_SIGNATURE_LENGTH 119 diff --git a/src/lib/libcrypto/rsa/rsa_ameth.c b/src/lib/libcrypto/rsa/rsa_ameth.c index 8c3209885e..2460910ab2 100644 --- a/src/lib/libcrypto/rsa/rsa_ameth.c +++ b/src/lib/libcrypto/rsa/rsa_ameth.c @@ -265,6 +265,147 @@ static int rsa_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent, return do_rsa_print(bp, pkey->pkey.rsa, indent, 1); } +static RSA_PSS_PARAMS *rsa_pss_decode(const X509_ALGOR *alg, + X509_ALGOR **pmaskHash) + { + const unsigned char *p; + int plen; + RSA_PSS_PARAMS *pss; + + *pmaskHash = NULL; + + if (!alg->parameter || alg->parameter->type != V_ASN1_SEQUENCE) + return NULL; + p = alg->parameter->value.sequence->data; + plen = alg->parameter->value.sequence->length; + pss = d2i_RSA_PSS_PARAMS(NULL, &p, plen); + + if (!pss) + return NULL; + + if (pss->maskGenAlgorithm) + { + ASN1_TYPE *param = pss->maskGenAlgorithm->parameter; + if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) == NID_mgf1 + && param->type == V_ASN1_SEQUENCE) + { + p = param->value.sequence->data; + plen = param->value.sequence->length; + *pmaskHash = d2i_X509_ALGOR(NULL, &p, plen); + } + } + + return pss; + } + +static int rsa_pss_param_print(BIO *bp, RSA_PSS_PARAMS *pss, + X509_ALGOR *maskHash, int indent) + { + int rv = 0; + if (!pss) + { + if (BIO_puts(bp, " (INVALID PSS PARAMETERS)\n") <= 0) + return 0; + return 1; + } + if (BIO_puts(bp, "\n") <= 0) + goto err; + if (!BIO_indent(bp, indent, 128)) + goto err; + if (BIO_puts(bp, "Hash Algorithm: ") <= 0) + goto err; + + if (pss->hashAlgorithm) + { + if (i2a_ASN1_OBJECT(bp, pss->hashAlgorithm->algorithm) <= 0) + goto err; + } + else if (BIO_puts(bp, "sha1 (default)") <= 0) + goto err; + + if (BIO_puts(bp, "\n") <= 0) + goto err; + + if (!BIO_indent(bp, indent, 128)) + goto err; + + if (BIO_puts(bp, "Mask Algorithm: ") <= 0) + goto err; + if (pss->maskGenAlgorithm) + { + if (i2a_ASN1_OBJECT(bp, pss->maskGenAlgorithm->algorithm) <= 0) + goto err; + if (BIO_puts(bp, " with ") <= 0) + goto err; + if (maskHash) + { + if (i2a_ASN1_OBJECT(bp, maskHash->algorithm) <= 0) + goto err; + } + else if (BIO_puts(bp, "INVALID") <= 0) + goto err; + } + else if (BIO_puts(bp, "mgf1 with sha1 (default)") <= 0) + goto err; + BIO_puts(bp, "\n"); + + if (!BIO_indent(bp, indent, 128)) + goto err; + if (BIO_puts(bp, "Salt Length: ") <= 0) + goto err; + if (pss->saltLength) + { + if (i2a_ASN1_INTEGER(bp, pss->saltLength) <= 0) + goto err; + } + else if (BIO_puts(bp, "20 (default)") <= 0) + goto err; + BIO_puts(bp, "\n"); + + if (!BIO_indent(bp, indent, 128)) + goto err; + if (BIO_puts(bp, "Trailer Field: ") <= 0) + goto err; + if (pss->trailerField) + { + if (i2a_ASN1_INTEGER(bp, pss->trailerField) <= 0) + goto err; + } + else if (BIO_puts(bp, "0xbc (default)") <= 0) + goto err; + BIO_puts(bp, "\n"); + + rv = 1; + + err: + return rv; + + } + +static int rsa_sig_print(BIO *bp, const X509_ALGOR *sigalg, + const ASN1_STRING *sig, + int indent, ASN1_PCTX *pctx) + { + if (OBJ_obj2nid(sigalg->algorithm) == NID_rsassaPss) + { + int rv; + RSA_PSS_PARAMS *pss; + X509_ALGOR *maskHash; + pss = rsa_pss_decode(sigalg, &maskHash); + rv = rsa_pss_param_print(bp, pss, maskHash, indent); + if (pss) + RSA_PSS_PARAMS_free(pss); + if (maskHash) + X509_ALGOR_free(maskHash); + if (!rv) + return 0; + } + else if (!sig && BIO_puts(bp, "\n") <= 0) + return 0; + if (sig) + return X509_signature_dump(bp, sig, indent); + return 1; + } static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) { @@ -310,6 +451,211 @@ static int rsa_pkey_ctrl(EVP_PKEY *pkey, int op, long arg1, void *arg2) } +/* Customised RSA item verification routine. This is called + * when a signature is encountered requiring special handling. We + * currently only handle PSS. + */ + + +static int rsa_item_verify(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, + X509_ALGOR *sigalg, ASN1_BIT_STRING *sig, + EVP_PKEY *pkey) + { + int rv = -1; + int saltlen; + const EVP_MD *mgf1md = NULL, *md = NULL; + RSA_PSS_PARAMS *pss; + X509_ALGOR *maskHash; + EVP_PKEY_CTX *pkctx; + /* Sanity check: make sure it is PSS */ + if (OBJ_obj2nid(sigalg->algorithm) != NID_rsassaPss) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_SIGNATURE_TYPE); + return -1; + } + /* Decode PSS parameters */ + pss = rsa_pss_decode(sigalg, &maskHash); + + if (pss == NULL) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_PSS_PARAMETERS); + goto err; + } + /* Check mask and lookup mask hash algorithm */ + if (pss->maskGenAlgorithm) + { + if (OBJ_obj2nid(pss->maskGenAlgorithm->algorithm) != NID_mgf1) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_ALGORITHM); + goto err; + } + if (!maskHash) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNSUPPORTED_MASK_PARAMETER); + goto err; + } + mgf1md = EVP_get_digestbyobj(maskHash->algorithm); + if (mgf1md == NULL) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_MASK_DIGEST); + goto err; + } + } + else + mgf1md = EVP_sha1(); + + if (pss->hashAlgorithm) + { + md = EVP_get_digestbyobj(pss->hashAlgorithm->algorithm); + if (md == NULL) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_UNKNOWN_PSS_DIGEST); + goto err; + } + } + else + md = EVP_sha1(); + + if (pss->saltLength) + { + saltlen = ASN1_INTEGER_get(pss->saltLength); + + /* Could perform more salt length sanity checks but the main + * RSA routines will trap other invalid values anyway. + */ + if (saltlen < 0) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_SALT_LENGTH); + goto err; + } + } + else + saltlen = 20; + + /* low-level routines support only trailer field 0xbc (value 1) + * and PKCS#1 says we should reject any other value anyway. + */ + if (pss->trailerField && ASN1_INTEGER_get(pss->trailerField) != 1) + { + RSAerr(RSA_F_RSA_ITEM_VERIFY, RSA_R_INVALID_TRAILER); + goto err; + } + + /* We have all parameters now set up context */ + + if (!EVP_DigestVerifyInit(ctx, &pkctx, md, NULL, pkey)) + goto err; + + if (EVP_PKEY_CTX_set_rsa_padding(pkctx, RSA_PKCS1_PSS_PADDING) <= 0) + goto err; + + if (EVP_PKEY_CTX_set_rsa_pss_saltlen(pkctx, saltlen) <= 0) + goto err; + + if (EVP_PKEY_CTX_set_rsa_mgf1_md(pkctx, mgf1md) <= 0) + goto err; + /* Carry on */ + rv = 2; + + err: + RSA_PSS_PARAMS_free(pss); + if (maskHash) + X509_ALGOR_free(maskHash); + return rv; + } + +static int rsa_item_sign(EVP_MD_CTX *ctx, const ASN1_ITEM *it, void *asn, + X509_ALGOR *alg1, X509_ALGOR *alg2, + ASN1_BIT_STRING *sig) + { + int pad_mode; + EVP_PKEY_CTX *pkctx = ctx->pctx; + if (EVP_PKEY_CTX_get_rsa_padding(pkctx, &pad_mode) <= 0) + return 0; + if (pad_mode == RSA_PKCS1_PADDING) + return 2; + if (pad_mode == RSA_PKCS1_PSS_PADDING) + { + const EVP_MD *sigmd, *mgf1md; + RSA_PSS_PARAMS *pss = NULL; + X509_ALGOR *mgf1alg = NULL; + ASN1_STRING *os1 = NULL, *os2 = NULL; + EVP_PKEY *pk = EVP_PKEY_CTX_get0_pkey(pkctx); + int saltlen, rv = 0; + sigmd = EVP_MD_CTX_md(ctx); + if (EVP_PKEY_CTX_get_rsa_mgf1_md(pkctx, &mgf1md) <= 0) + goto err; + if (!EVP_PKEY_CTX_get_rsa_pss_saltlen(pkctx, &saltlen)) + goto err; + if (saltlen == -1) + saltlen = EVP_MD_size(sigmd); + else if (saltlen == -2) + { + saltlen = EVP_PKEY_size(pk) - EVP_MD_size(sigmd) - 2; + if (((EVP_PKEY_bits(pk) - 1) & 0x7) == 0) + saltlen--; + } + pss = RSA_PSS_PARAMS_new(); + if (!pss) + goto err; + if (saltlen != 20) + { + pss->saltLength = ASN1_INTEGER_new(); + if (!pss->saltLength) + goto err; + if (!ASN1_INTEGER_set(pss->saltLength, saltlen)) + goto err; + } + if (EVP_MD_type(sigmd) != NID_sha1) + { + pss->hashAlgorithm = X509_ALGOR_new(); + if (!pss->hashAlgorithm) + goto err; + X509_ALGOR_set_md(pss->hashAlgorithm, sigmd); + } + if (EVP_MD_type(mgf1md) != NID_sha1) + { + ASN1_STRING *stmp = NULL; + /* need to embed algorithm ID inside another */ + mgf1alg = X509_ALGOR_new(); + X509_ALGOR_set_md(mgf1alg, mgf1md); + if (!ASN1_item_pack(mgf1alg, ASN1_ITEM_rptr(X509_ALGOR), + &stmp)) + goto err; + pss->maskGenAlgorithm = X509_ALGOR_new(); + if (!pss->maskGenAlgorithm) + goto err; + X509_ALGOR_set0(pss->maskGenAlgorithm, + OBJ_nid2obj(NID_mgf1), + V_ASN1_SEQUENCE, stmp); + } + /* Finally create string with pss parameter encoding. */ + if (!ASN1_item_pack(pss, ASN1_ITEM_rptr(RSA_PSS_PARAMS), &os1)) + goto err; + if (alg2) + { + os2 = ASN1_STRING_dup(os1); + if (!os2) + goto err; + X509_ALGOR_set0(alg2, OBJ_nid2obj(NID_rsassaPss), + V_ASN1_SEQUENCE, os2); + } + X509_ALGOR_set0(alg1, OBJ_nid2obj(NID_rsassaPss), + V_ASN1_SEQUENCE, os1); + os1 = os2 = NULL; + rv = 3; + err: + if (mgf1alg) + X509_ALGOR_free(mgf1alg); + if (pss) + RSA_PSS_PARAMS_free(pss); + if (os1) + ASN1_STRING_free(os1); + return rv; + + } + return 2; + } const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = { @@ -335,10 +681,13 @@ const EVP_PKEY_ASN1_METHOD rsa_asn1_meths[] = 0,0,0,0,0,0, + rsa_sig_print, int_rsa_free, rsa_pkey_ctrl, old_rsa_priv_decode, - old_rsa_priv_encode + old_rsa_priv_encode, + rsa_item_verify, + rsa_item_sign }, { diff --git a/src/lib/libcrypto/rsa/rsa_asn1.c b/src/lib/libcrypto/rsa/rsa_asn1.c index 4efca8cdc8..6ed5de3db4 100644 --- a/src/lib/libcrypto/rsa/rsa_asn1.c +++ b/src/lib/libcrypto/rsa/rsa_asn1.c @@ -60,6 +60,7 @@ #include "cryptlib.h" #include #include +#include #include /* Override the default free and new methods */ @@ -96,6 +97,15 @@ ASN1_SEQUENCE_cb(RSAPublicKey, rsa_cb) = { ASN1_SIMPLE(RSA, e, BIGNUM), } ASN1_SEQUENCE_END_cb(RSA, RSAPublicKey) +ASN1_SEQUENCE(RSA_PSS_PARAMS) = { + ASN1_EXP_OPT(RSA_PSS_PARAMS, hashAlgorithm, X509_ALGOR,0), + ASN1_EXP_OPT(RSA_PSS_PARAMS, maskGenAlgorithm, X509_ALGOR,1), + ASN1_EXP_OPT(RSA_PSS_PARAMS, saltLength, ASN1_INTEGER,2), + ASN1_EXP_OPT(RSA_PSS_PARAMS, trailerField, ASN1_INTEGER,3) +} ASN1_SEQUENCE_END(RSA_PSS_PARAMS) + +IMPLEMENT_ASN1_FUNCTIONS(RSA_PSS_PARAMS) + IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(RSA, RSAPrivateKey, RSAPrivateKey) IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(RSA, RSAPublicKey, RSAPublicKey) diff --git a/src/lib/libcrypto/rsa/rsa_crpt.c b/src/lib/libcrypto/rsa/rsa_crpt.c new file mode 100644 index 0000000000..d3e44785dc --- /dev/null +++ b/src/lib/libcrypto/rsa/rsa_crpt.c @@ -0,0 +1,257 @@ +/* crypto/rsa/rsa_lib.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +#include +#include +#include "cryptlib.h" +#include +#include +#include +#include +#ifndef OPENSSL_NO_ENGINE +#include +#endif + +int RSA_size(const RSA *r) + { + return(BN_num_bytes(r->n)); + } + +int RSA_public_encrypt(int flen, const unsigned char *from, unsigned char *to, + RSA *rsa, int padding) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) + && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) + { + RSAerr(RSA_F_RSA_PUBLIC_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD); + return -1; + } +#endif + return(rsa->meth->rsa_pub_enc(flen, from, to, rsa, padding)); + } + +int RSA_private_encrypt(int flen, const unsigned char *from, unsigned char *to, + RSA *rsa, int padding) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) + && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) + { + RSAerr(RSA_F_RSA_PRIVATE_ENCRYPT, RSA_R_NON_FIPS_RSA_METHOD); + return -1; + } +#endif + return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding)); + } + +int RSA_private_decrypt(int flen, const unsigned char *from, unsigned char *to, + RSA *rsa, int padding) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) + && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) + { + RSAerr(RSA_F_RSA_PRIVATE_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD); + return -1; + } +#endif + return(rsa->meth->rsa_priv_dec(flen, from, to, rsa, padding)); + } + +int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to, + RSA *rsa, int padding) + { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) + && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) + { + RSAerr(RSA_F_RSA_PUBLIC_DECRYPT, RSA_R_NON_FIPS_RSA_METHOD); + return -1; + } +#endif + return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding)); + } + +int RSA_flags(const RSA *r) + { + return((r == NULL)?0:r->meth->flags); + } + +void RSA_blinding_off(RSA *rsa) + { + if (rsa->blinding != NULL) + { + BN_BLINDING_free(rsa->blinding); + rsa->blinding=NULL; + } + rsa->flags &= ~RSA_FLAG_BLINDING; + rsa->flags |= RSA_FLAG_NO_BLINDING; + } + +int RSA_blinding_on(RSA *rsa, BN_CTX *ctx) + { + int ret=0; + + if (rsa->blinding != NULL) + RSA_blinding_off(rsa); + + rsa->blinding = RSA_setup_blinding(rsa, ctx); + if (rsa->blinding == NULL) + goto err; + + rsa->flags |= RSA_FLAG_BLINDING; + rsa->flags &= ~RSA_FLAG_NO_BLINDING; + ret=1; +err: + return(ret); + } + +static BIGNUM *rsa_get_public_exp(const BIGNUM *d, const BIGNUM *p, + const BIGNUM *q, BN_CTX *ctx) +{ + BIGNUM *ret = NULL, *r0, *r1, *r2; + + if (d == NULL || p == NULL || q == NULL) + return NULL; + + BN_CTX_start(ctx); + r0 = BN_CTX_get(ctx); + r1 = BN_CTX_get(ctx); + r2 = BN_CTX_get(ctx); + if (r2 == NULL) + goto err; + + if (!BN_sub(r1, p, BN_value_one())) goto err; + if (!BN_sub(r2, q, BN_value_one())) goto err; + if (!BN_mul(r0, r1, r2, ctx)) goto err; + + ret = BN_mod_inverse(NULL, d, r0, ctx); +err: + BN_CTX_end(ctx); + return ret; +} + +BN_BLINDING *RSA_setup_blinding(RSA *rsa, BN_CTX *in_ctx) +{ + BIGNUM local_n; + BIGNUM *e,*n; + BN_CTX *ctx; + BN_BLINDING *ret = NULL; + + if (in_ctx == NULL) + { + if ((ctx = BN_CTX_new()) == NULL) return 0; + } + else + ctx = in_ctx; + + BN_CTX_start(ctx); + e = BN_CTX_get(ctx); + if (e == NULL) + { + RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_MALLOC_FAILURE); + goto err; + } + + if (rsa->e == NULL) + { + e = rsa_get_public_exp(rsa->d, rsa->p, rsa->q, ctx); + if (e == NULL) + { + RSAerr(RSA_F_RSA_SETUP_BLINDING, RSA_R_NO_PUBLIC_EXPONENT); + goto err; + } + } + else + e = rsa->e; + + + if ((RAND_status() == 0) && rsa->d != NULL && rsa->d->d != NULL) + { + /* if PRNG is not properly seeded, resort to secret + * exponent as unpredictable seed */ + RAND_add(rsa->d->d, rsa->d->dmax * sizeof rsa->d->d[0], 0.0); + } + + if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) + { + /* Set BN_FLG_CONSTTIME flag */ + n = &local_n; + BN_with_flags(n, rsa->n, BN_FLG_CONSTTIME); + } + else + n = rsa->n; + + ret = BN_BLINDING_create_param(NULL, e, n, ctx, + rsa->meth->bn_mod_exp, rsa->_method_mod_n); + if (ret == NULL) + { + RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_BN_LIB); + goto err; + } + CRYPTO_THREADID_current(BN_BLINDING_thread_id(ret)); +err: + BN_CTX_end(ctx); + if (in_ctx == NULL) + BN_CTX_free(ctx); + if(rsa->e == NULL) + BN_free(e); + + return ret; +} diff --git a/src/lib/libcrypto/rsa/rsa_err.c b/src/lib/libcrypto/rsa/rsa_err.c index cf9f1106b0..46e0bf9980 100644 --- a/src/lib/libcrypto/rsa/rsa_err.c +++ b/src/lib/libcrypto/rsa/rsa_err.c @@ -1,6 +1,6 @@ /* crypto/rsa/rsa_err.c */ /* ==================================================================== - * Copyright (c) 1999-2008 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2011 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -78,6 +78,7 @@ static ERR_STRING_DATA RSA_str_functs[]= {ERR_FUNC(RSA_F_PKEY_RSA_CTRL), "PKEY_RSA_CTRL"}, {ERR_FUNC(RSA_F_PKEY_RSA_CTRL_STR), "PKEY_RSA_CTRL_STR"}, {ERR_FUNC(RSA_F_PKEY_RSA_SIGN), "PKEY_RSA_SIGN"}, +{ERR_FUNC(RSA_F_PKEY_RSA_VERIFY), "PKEY_RSA_VERIFY"}, {ERR_FUNC(RSA_F_PKEY_RSA_VERIFYRECOVER), "PKEY_RSA_VERIFYRECOVER"}, {ERR_FUNC(RSA_F_RSA_BUILTIN_KEYGEN), "RSA_BUILTIN_KEYGEN"}, {ERR_FUNC(RSA_F_RSA_CHECK_KEY), "RSA_check_key"}, @@ -86,6 +87,8 @@ static ERR_STRING_DATA RSA_str_functs[]= {ERR_FUNC(RSA_F_RSA_EAY_PUBLIC_DECRYPT), "RSA_EAY_PUBLIC_DECRYPT"}, {ERR_FUNC(RSA_F_RSA_EAY_PUBLIC_ENCRYPT), "RSA_EAY_PUBLIC_ENCRYPT"}, {ERR_FUNC(RSA_F_RSA_GENERATE_KEY), "RSA_generate_key"}, +{ERR_FUNC(RSA_F_RSA_GENERATE_KEY_EX), "RSA_generate_key_ex"}, +{ERR_FUNC(RSA_F_RSA_ITEM_VERIFY), "RSA_ITEM_VERIFY"}, {ERR_FUNC(RSA_F_RSA_MEMORY_LOCK), "RSA_memory_lock"}, {ERR_FUNC(RSA_F_RSA_NEW_METHOD), "RSA_new_method"}, {ERR_FUNC(RSA_F_RSA_NULL), "RSA_NULL"}, @@ -97,6 +100,7 @@ static ERR_STRING_DATA RSA_str_functs[]= {ERR_FUNC(RSA_F_RSA_PADDING_ADD_NONE), "RSA_padding_add_none"}, {ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_OAEP), "RSA_padding_add_PKCS1_OAEP"}, {ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_PSS), "RSA_padding_add_PKCS1_PSS"}, +{ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1), "RSA_padding_add_PKCS1_PSS_mgf1"}, {ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_1), "RSA_padding_add_PKCS1_type_1"}, {ERR_FUNC(RSA_F_RSA_PADDING_ADD_PKCS1_TYPE_2), "RSA_padding_add_PKCS1_type_2"}, {ERR_FUNC(RSA_F_RSA_PADDING_ADD_SSLV23), "RSA_padding_add_SSLv23"}, @@ -109,8 +113,12 @@ static ERR_STRING_DATA RSA_str_functs[]= {ERR_FUNC(RSA_F_RSA_PADDING_CHECK_X931), "RSA_padding_check_X931"}, {ERR_FUNC(RSA_F_RSA_PRINT), "RSA_print"}, {ERR_FUNC(RSA_F_RSA_PRINT_FP), "RSA_print_fp"}, +{ERR_FUNC(RSA_F_RSA_PRIVATE_DECRYPT), "RSA_private_decrypt"}, +{ERR_FUNC(RSA_F_RSA_PRIVATE_ENCRYPT), "RSA_private_encrypt"}, {ERR_FUNC(RSA_F_RSA_PRIV_DECODE), "RSA_PRIV_DECODE"}, {ERR_FUNC(RSA_F_RSA_PRIV_ENCODE), "RSA_PRIV_ENCODE"}, +{ERR_FUNC(RSA_F_RSA_PUBLIC_DECRYPT), "RSA_public_decrypt"}, +{ERR_FUNC(RSA_F_RSA_PUBLIC_ENCRYPT), "RSA_public_encrypt"}, {ERR_FUNC(RSA_F_RSA_PUB_DECODE), "RSA_PUB_DECODE"}, {ERR_FUNC(RSA_F_RSA_SETUP_BLINDING), "RSA_setup_blinding"}, {ERR_FUNC(RSA_F_RSA_SIGN), "RSA_sign"}, @@ -118,6 +126,7 @@ static ERR_STRING_DATA RSA_str_functs[]= {ERR_FUNC(RSA_F_RSA_VERIFY), "RSA_verify"}, {ERR_FUNC(RSA_F_RSA_VERIFY_ASN1_OCTET_STRING), "RSA_verify_ASN1_OCTET_STRING"}, {ERR_FUNC(RSA_F_RSA_VERIFY_PKCS1_PSS), "RSA_verify_PKCS1_PSS"}, +{ERR_FUNC(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1), "RSA_verify_PKCS1_PSS_mgf1"}, {0,NULL} }; @@ -146,19 +155,24 @@ static ERR_STRING_DATA RSA_str_reasons[]= {ERR_REASON(RSA_R_INVALID_HEADER) ,"invalid header"}, {ERR_REASON(RSA_R_INVALID_KEYBITS) ,"invalid keybits"}, {ERR_REASON(RSA_R_INVALID_MESSAGE_LENGTH),"invalid message length"}, +{ERR_REASON(RSA_R_INVALID_MGF1_MD) ,"invalid mgf1 md"}, {ERR_REASON(RSA_R_INVALID_PADDING) ,"invalid padding"}, {ERR_REASON(RSA_R_INVALID_PADDING_MODE) ,"invalid padding mode"}, +{ERR_REASON(RSA_R_INVALID_PSS_PARAMETERS),"invalid pss parameters"}, {ERR_REASON(RSA_R_INVALID_PSS_SALTLEN) ,"invalid pss saltlen"}, +{ERR_REASON(RSA_R_INVALID_SALT_LENGTH) ,"invalid salt length"}, {ERR_REASON(RSA_R_INVALID_TRAILER) ,"invalid trailer"}, {ERR_REASON(RSA_R_INVALID_X931_DIGEST) ,"invalid x931 digest"}, {ERR_REASON(RSA_R_IQMP_NOT_INVERSE_OF_Q) ,"iqmp not inverse of q"}, {ERR_REASON(RSA_R_KEY_SIZE_TOO_SMALL) ,"key size too small"}, {ERR_REASON(RSA_R_LAST_OCTET_INVALID) ,"last octet invalid"}, {ERR_REASON(RSA_R_MODULUS_TOO_LARGE) ,"modulus too large"}, +{ERR_REASON(RSA_R_NON_FIPS_RSA_METHOD) ,"non fips rsa method"}, {ERR_REASON(RSA_R_NO_PUBLIC_EXPONENT) ,"no public exponent"}, {ERR_REASON(RSA_R_NULL_BEFORE_BLOCK_MISSING),"null before block missing"}, {ERR_REASON(RSA_R_N_DOES_NOT_EQUAL_P_Q) ,"n does not equal p q"}, {ERR_REASON(RSA_R_OAEP_DECODING_ERROR) ,"oaep decoding error"}, +{ERR_REASON(RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE),"operation not allowed in fips mode"}, {ERR_REASON(RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE),"operation not supported for this keytype"}, {ERR_REASON(RSA_R_PADDING_CHECK_FAILED) ,"padding check failed"}, {ERR_REASON(RSA_R_P_NOT_PRIME) ,"p not prime"}, @@ -169,7 +183,12 @@ static ERR_STRING_DATA RSA_str_reasons[]= {ERR_REASON(RSA_R_SSLV3_ROLLBACK_ATTACK) ,"sslv3 rollback attack"}, {ERR_REASON(RSA_R_THE_ASN1_OBJECT_IDENTIFIER_IS_NOT_KNOWN_FOR_THIS_MD),"the asn1 object identifier is not known for this md"}, {ERR_REASON(RSA_R_UNKNOWN_ALGORITHM_TYPE),"unknown algorithm type"}, +{ERR_REASON(RSA_R_UNKNOWN_MASK_DIGEST) ,"unknown mask digest"}, {ERR_REASON(RSA_R_UNKNOWN_PADDING_TYPE) ,"unknown padding type"}, +{ERR_REASON(RSA_R_UNKNOWN_PSS_DIGEST) ,"unknown pss digest"}, +{ERR_REASON(RSA_R_UNSUPPORTED_MASK_ALGORITHM),"unsupported mask algorithm"}, +{ERR_REASON(RSA_R_UNSUPPORTED_MASK_PARAMETER),"unsupported mask parameter"}, +{ERR_REASON(RSA_R_UNSUPPORTED_SIGNATURE_TYPE),"unsupported signature type"}, {ERR_REASON(RSA_R_VALUE_MISSING) ,"value missing"}, {ERR_REASON(RSA_R_WRONG_SIGNATURE_LENGTH),"wrong signature length"}, {0,NULL} diff --git a/src/lib/libcrypto/rsa/rsa_gen.c b/src/lib/libcrypto/rsa/rsa_gen.c index 767f7ab682..42290cce66 100644 --- a/src/lib/libcrypto/rsa/rsa_gen.c +++ b/src/lib/libcrypto/rsa/rsa_gen.c @@ -67,6 +67,9 @@ #include "cryptlib.h" #include #include +#ifdef OPENSSL_FIPS +#include +#endif static int rsa_builtin_keygen(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb); @@ -77,8 +80,20 @@ static int rsa_builtin_keygen(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb) * now just because key-generation is part of RSA_METHOD. */ int RSA_generate_key_ex(RSA *rsa, int bits, BIGNUM *e_value, BN_GENCB *cb) { +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) + && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) + { + RSAerr(RSA_F_RSA_GENERATE_KEY_EX, RSA_R_NON_FIPS_RSA_METHOD); + return 0; + } +#endif if(rsa->meth->rsa_keygen) return rsa->meth->rsa_keygen(rsa, bits, e_value, cb); +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + return FIPS_rsa_generate_key_ex(rsa, bits, e_value, cb); +#endif return rsa_builtin_keygen(rsa, bits, e_value, cb); } diff --git a/src/lib/libcrypto/rsa/rsa_lib.c b/src/lib/libcrypto/rsa/rsa_lib.c index de45088d76..c95ceafc82 100644 --- a/src/lib/libcrypto/rsa/rsa_lib.c +++ b/src/lib/libcrypto/rsa/rsa_lib.c @@ -67,6 +67,10 @@ #include #endif +#ifdef OPENSSL_FIPS +#include +#endif + const char RSA_version[]="RSA" OPENSSL_VERSION_PTEXT; static const RSA_METHOD *default_RSA_meth=NULL; @@ -87,11 +91,14 @@ const RSA_METHOD *RSA_get_default_method(void) { if (default_RSA_meth == NULL) { +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + return FIPS_rsa_pkcs1_ssleay(); + else + return RSA_PKCS1_SSLeay(); +#else #ifdef RSA_NULL default_RSA_meth=RSA_null_method(); -#else -#if 0 /* was: #ifdef RSAref */ - default_RSA_meth=RSA_PKCS1_RSAref(); #else default_RSA_meth=RSA_PKCS1_SSLeay(); #endif @@ -181,7 +188,7 @@ RSA *RSA_new_method(ENGINE *engine) ret->blinding=NULL; ret->mt_blinding=NULL; ret->bignum_data=NULL; - ret->flags=ret->meth->flags; + ret->flags=ret->meth->flags & ~RSA_FLAG_NON_FIPS_ALLOW; if (!CRYPTO_new_ex_data(CRYPTO_EX_INDEX_RSA, ret, &ret->ex_data)) { #ifndef OPENSSL_NO_ENGINE @@ -280,163 +287,6 @@ void *RSA_get_ex_data(const RSA *r, int idx) return(CRYPTO_get_ex_data(&r->ex_data,idx)); } -int RSA_size(const RSA *r) - { - return(BN_num_bytes(r->n)); - } - -int RSA_public_encrypt(int flen, const unsigned char *from, unsigned char *to, - RSA *rsa, int padding) - { - return(rsa->meth->rsa_pub_enc(flen, from, to, rsa, padding)); - } - -int RSA_private_encrypt(int flen, const unsigned char *from, unsigned char *to, - RSA *rsa, int padding) - { - return(rsa->meth->rsa_priv_enc(flen, from, to, rsa, padding)); - } - -int RSA_private_decrypt(int flen, const unsigned char *from, unsigned char *to, - RSA *rsa, int padding) - { - return(rsa->meth->rsa_priv_dec(flen, from, to, rsa, padding)); - } - -int RSA_public_decrypt(int flen, const unsigned char *from, unsigned char *to, - RSA *rsa, int padding) - { - return(rsa->meth->rsa_pub_dec(flen, from, to, rsa, padding)); - } - -int RSA_flags(const RSA *r) - { - return((r == NULL)?0:r->meth->flags); - } - -void RSA_blinding_off(RSA *rsa) - { - if (rsa->blinding != NULL) - { - BN_BLINDING_free(rsa->blinding); - rsa->blinding=NULL; - } - rsa->flags &= ~RSA_FLAG_BLINDING; - rsa->flags |= RSA_FLAG_NO_BLINDING; - } - -int RSA_blinding_on(RSA *rsa, BN_CTX *ctx) - { - int ret=0; - - if (rsa->blinding != NULL) - RSA_blinding_off(rsa); - - rsa->blinding = RSA_setup_blinding(rsa, ctx); - if (rsa->blinding == NULL) - goto err; - - rsa->flags |= RSA_FLAG_BLINDING; - rsa->flags &= ~RSA_FLAG_NO_BLINDING; - ret=1; -err: - return(ret); - } - -static BIGNUM *rsa_get_public_exp(const BIGNUM *d, const BIGNUM *p, - const BIGNUM *q, BN_CTX *ctx) -{ - BIGNUM *ret = NULL, *r0, *r1, *r2; - - if (d == NULL || p == NULL || q == NULL) - return NULL; - - BN_CTX_start(ctx); - r0 = BN_CTX_get(ctx); - r1 = BN_CTX_get(ctx); - r2 = BN_CTX_get(ctx); - if (r2 == NULL) - goto err; - - if (!BN_sub(r1, p, BN_value_one())) goto err; - if (!BN_sub(r2, q, BN_value_one())) goto err; - if (!BN_mul(r0, r1, r2, ctx)) goto err; - - ret = BN_mod_inverse(NULL, d, r0, ctx); -err: - BN_CTX_end(ctx); - return ret; -} - -BN_BLINDING *RSA_setup_blinding(RSA *rsa, BN_CTX *in_ctx) -{ - BIGNUM local_n; - BIGNUM *e,*n; - BN_CTX *ctx; - BN_BLINDING *ret = NULL; - - if (in_ctx == NULL) - { - if ((ctx = BN_CTX_new()) == NULL) return 0; - } - else - ctx = in_ctx; - - BN_CTX_start(ctx); - e = BN_CTX_get(ctx); - if (e == NULL) - { - RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_MALLOC_FAILURE); - goto err; - } - - if (rsa->e == NULL) - { - e = rsa_get_public_exp(rsa->d, rsa->p, rsa->q, ctx); - if (e == NULL) - { - RSAerr(RSA_F_RSA_SETUP_BLINDING, RSA_R_NO_PUBLIC_EXPONENT); - goto err; - } - } - else - e = rsa->e; - - - if ((RAND_status() == 0) && rsa->d != NULL && rsa->d->d != NULL) - { - /* if PRNG is not properly seeded, resort to secret - * exponent as unpredictable seed */ - RAND_add(rsa->d->d, rsa->d->dmax * sizeof rsa->d->d[0], 0.0); - } - - if (!(rsa->flags & RSA_FLAG_NO_CONSTTIME)) - { - /* Set BN_FLG_CONSTTIME flag */ - n = &local_n; - BN_with_flags(n, rsa->n, BN_FLG_CONSTTIME); - } - else - n = rsa->n; - - ret = BN_BLINDING_create_param(NULL, e, n, ctx, - rsa->meth->bn_mod_exp, rsa->_method_mod_n); - if (ret == NULL) - { - RSAerr(RSA_F_RSA_SETUP_BLINDING, ERR_R_BN_LIB); - goto err; - } - CRYPTO_THREADID_current(BN_BLINDING_thread_id(ret)); -err: - BN_CTX_end(ctx); - if (in_ctx == NULL) - BN_CTX_free(ctx); - if(rsa->e == NULL) - BN_free(e); - - return ret; -} - int RSA_memory_lock(RSA *r) { int i,j,k,off; diff --git a/src/lib/libcrypto/rsa/rsa_oaep.c b/src/lib/libcrypto/rsa/rsa_oaep.c index 18d307ea9e..553d212ebe 100644 --- a/src/lib/libcrypto/rsa/rsa_oaep.c +++ b/src/lib/libcrypto/rsa/rsa_oaep.c @@ -56,7 +56,8 @@ int RSA_padding_add_PKCS1_OAEP(unsigned char *to, int tlen, seed = to + 1; db = to + SHA_DIGEST_LENGTH + 1; - EVP_Digest((void *)param, plen, db, NULL, EVP_sha1(), NULL); + if (!EVP_Digest((void *)param, plen, db, NULL, EVP_sha1(), NULL)) + return 0; memset(db + SHA_DIGEST_LENGTH, 0, emlen - flen - 2 * SHA_DIGEST_LENGTH - 1); db[emlen - flen - SHA_DIGEST_LENGTH - 1] = 0x01; @@ -145,7 +146,8 @@ int RSA_padding_check_PKCS1_OAEP(unsigned char *to, int tlen, for (i = 0; i < dblen; i++) db[i] ^= maskeddb[i]; - EVP_Digest((void *)param, plen, phash, NULL, EVP_sha1(), NULL); + if (!EVP_Digest((void *)param, plen, phash, NULL, EVP_sha1(), NULL)) + return -1; if (memcmp(db, phash, SHA_DIGEST_LENGTH) != 0 || bad) goto decoding_err; diff --git a/src/lib/libcrypto/rsa/rsa_pmeth.c b/src/lib/libcrypto/rsa/rsa_pmeth.c index c6892ecd09..5b2ecf56ad 100644 --- a/src/lib/libcrypto/rsa/rsa_pmeth.c +++ b/src/lib/libcrypto/rsa/rsa_pmeth.c @@ -63,6 +63,12 @@ #include #include #include +#ifndef OPENSSL_NO_CMS +#include +#endif +#ifdef OPENSSL_FIPS +#include +#endif #include "evp_locl.h" #include "rsa_locl.h" @@ -79,6 +85,8 @@ typedef struct int pad_mode; /* message digest */ const EVP_MD *md; + /* message digest for MGF1 */ + const EVP_MD *mgf1md; /* PSS/OAEP salt length */ int saltlen; /* Temp buffer */ @@ -95,6 +103,7 @@ static int pkey_rsa_init(EVP_PKEY_CTX *ctx) rctx->pub_exp = NULL; rctx->pad_mode = RSA_PKCS1_PADDING; rctx->md = NULL; + rctx->mgf1md = NULL; rctx->tbuf = NULL; rctx->saltlen = -2; @@ -147,6 +156,31 @@ static void pkey_rsa_cleanup(EVP_PKEY_CTX *ctx) OPENSSL_free(rctx); } } +#ifdef OPENSSL_FIPS +/* FIP checker. Return value indicates status of context parameters: + * 1 : redirect to FIPS. + * 0 : don't redirect to FIPS. + * -1 : illegal operation in FIPS mode. + */ + +static int pkey_fips_check_ctx(EVP_PKEY_CTX *ctx) + { + RSA_PKEY_CTX *rctx = ctx->data; + RSA *rsa = ctx->pkey->pkey.rsa; + int rv = -1; + if (!FIPS_mode()) + return 0; + if (rsa->flags & RSA_FLAG_NON_FIPS_ALLOW) + rv = 0; + if (!(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) && rv) + return -1; + if (rctx->md && !(rctx->md->flags & EVP_MD_FLAG_FIPS)) + return rv; + if (rctx->mgf1md && !(rctx->mgf1md->flags & EVP_MD_FLAG_FIPS)) + return rv; + return 1; + } +#endif static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, const unsigned char *tbs, size_t tbslen) @@ -155,6 +189,15 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, RSA_PKEY_CTX *rctx = ctx->data; RSA *rsa = ctx->pkey->pkey.rsa; +#ifdef OPENSSL_FIPS + ret = pkey_fips_check_ctx(ctx); + if (ret < 0) + { + RSAerr(RSA_F_PKEY_RSA_SIGN, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE); + return -1; + } +#endif + if (rctx->md) { if (tbslen != (size_t)EVP_MD_size(rctx->md)) @@ -163,7 +206,36 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, RSA_R_INVALID_DIGEST_LENGTH); return -1; } - if (rctx->pad_mode == RSA_X931_PADDING) +#ifdef OPENSSL_FIPS + if (ret > 0) + { + unsigned int slen; + ret = FIPS_rsa_sign_digest(rsa, tbs, tbslen, rctx->md, + rctx->pad_mode, + rctx->saltlen, + rctx->mgf1md, + sig, &slen); + if (ret > 0) + *siglen = slen; + else + *siglen = 0; + return ret; + } +#endif + + if (EVP_MD_type(rctx->md) == NID_mdc2) + { + unsigned int sltmp; + if (rctx->pad_mode != RSA_PKCS1_PADDING) + return -1; + ret = RSA_sign_ASN1_OCTET_STRING(NID_mdc2, + tbs, tbslen, sig, &sltmp, rsa); + + if (ret <= 0) + return ret; + ret = sltmp; + } + else if (rctx->pad_mode == RSA_X931_PADDING) { if (!setup_tbuf(rctx, ctx)) return -1; @@ -186,8 +258,10 @@ static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, unsigned char *sig, size_t *siglen, { if (!setup_tbuf(rctx, ctx)) return -1; - if (!RSA_padding_add_PKCS1_PSS(rsa, rctx->tbuf, tbs, - rctx->md, rctx->saltlen)) + if (!RSA_padding_add_PKCS1_PSS_mgf1(rsa, + rctx->tbuf, tbs, + rctx->md, rctx->mgf1md, + rctx->saltlen)) return -1; ret = RSA_private_encrypt(RSA_size(rsa), rctx->tbuf, sig, rsa, RSA_NO_PADDING); @@ -269,8 +343,30 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx, RSA_PKEY_CTX *rctx = ctx->data; RSA *rsa = ctx->pkey->pkey.rsa; size_t rslen; +#ifdef OPENSSL_FIPS + int rv; + rv = pkey_fips_check_ctx(ctx); + if (rv < 0) + { + RSAerr(RSA_F_PKEY_RSA_VERIFY, RSA_R_OPERATION_NOT_ALLOWED_IN_FIPS_MODE); + return -1; + } +#endif if (rctx->md) { +#ifdef OPENSSL_FIPS + if (rv > 0) + { + return FIPS_rsa_verify_digest(rsa, + tbs, tbslen, + rctx->md, + rctx->pad_mode, + rctx->saltlen, + rctx->mgf1md, + sig, siglen); + + } +#endif if (rctx->pad_mode == RSA_PKCS1_PADDING) return RSA_verify(EVP_MD_type(rctx->md), tbs, tbslen, sig, siglen, rsa); @@ -289,7 +385,8 @@ static int pkey_rsa_verify(EVP_PKEY_CTX *ctx, rsa, RSA_NO_PADDING); if (ret <= 0) return 0; - ret = RSA_verify_PKCS1_PSS(rsa, tbs, rctx->md, + ret = RSA_verify_PKCS1_PSS_mgf1(rsa, tbs, + rctx->md, rctx->mgf1md, rctx->tbuf, rctx->saltlen); if (ret <= 0) return 0; @@ -403,15 +500,25 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) RSA_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE); return -2; + case EVP_PKEY_CTRL_GET_RSA_PADDING: + *(int *)p2 = rctx->pad_mode; + return 1; + case EVP_PKEY_CTRL_RSA_PSS_SALTLEN: - if (p1 < -2) - return -2; + case EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN: if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) { RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_PSS_SALTLEN); return -2; } - rctx->saltlen = p1; + if (type == EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN) + *(int *)p2 = rctx->saltlen; + else + { + if (p1 < -2) + return -2; + rctx->saltlen = p1; + } return 1; case EVP_PKEY_CTRL_RSA_KEYGEN_BITS: @@ -435,16 +542,45 @@ static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) rctx->md = p2; return 1; + case EVP_PKEY_CTRL_RSA_MGF1_MD: + case EVP_PKEY_CTRL_GET_RSA_MGF1_MD: + if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) + { + RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_INVALID_MGF1_MD); + return -2; + } + if (type == EVP_PKEY_CTRL_GET_RSA_MGF1_MD) + { + if (rctx->mgf1md) + *(const EVP_MD **)p2 = rctx->mgf1md; + else + *(const EVP_MD **)p2 = rctx->md; + } + else + rctx->mgf1md = p2; + return 1; + case EVP_PKEY_CTRL_DIGESTINIT: case EVP_PKEY_CTRL_PKCS7_ENCRYPT: case EVP_PKEY_CTRL_PKCS7_DECRYPT: case EVP_PKEY_CTRL_PKCS7_SIGN: + return 1; #ifndef OPENSSL_NO_CMS - case EVP_PKEY_CTRL_CMS_ENCRYPT: case EVP_PKEY_CTRL_CMS_DECRYPT: + { + X509_ALGOR *alg = NULL; + ASN1_OBJECT *encalg = NULL; + if (p2) + CMS_RecipientInfo_ktri_get0_algs(p2, NULL, NULL, &alg); + if (alg) + X509_ALGOR_get0(&encalg, NULL, NULL, alg); + if (encalg && OBJ_obj2nid(encalg) == NID_rsaesOaep) + rctx->pad_mode = RSA_PKCS1_OAEP_PADDING; + } + case EVP_PKEY_CTRL_CMS_ENCRYPT: case EVP_PKEY_CTRL_CMS_SIGN: -#endif return 1; +#endif case EVP_PKEY_CTRL_PEER_KEY: RSAerr(RSA_F_PKEY_RSA_CTRL, RSA_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); diff --git a/src/lib/libcrypto/rsa/rsa_pss.c b/src/lib/libcrypto/rsa/rsa_pss.c index ac211e2ffe..5f9f533d0c 100644 --- a/src/lib/libcrypto/rsa/rsa_pss.c +++ b/src/lib/libcrypto/rsa/rsa_pss.c @@ -73,6 +73,13 @@ static const unsigned char zeroes[] = {0,0,0,0,0,0,0,0}; int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, const EVP_MD *Hash, const unsigned char *EM, int sLen) { + return RSA_verify_PKCS1_PSS_mgf1(rsa, mHash, Hash, NULL, EM, sLen); + } + +int RSA_verify_PKCS1_PSS_mgf1(RSA *rsa, const unsigned char *mHash, + const EVP_MD *Hash, const EVP_MD *mgf1Hash, + const unsigned char *EM, int sLen) + { int i; int ret = 0; int hLen, maskedDBLen, MSBits, emLen; @@ -80,6 +87,10 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, unsigned char *DB = NULL; EVP_MD_CTX ctx; unsigned char H_[EVP_MAX_MD_SIZE]; + EVP_MD_CTX_init(&ctx); + + if (mgf1Hash == NULL) + mgf1Hash = Hash; hLen = EVP_MD_size(Hash); if (hLen < 0) @@ -94,7 +105,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, else if (sLen == -2) sLen = -2; else if (sLen < -2) { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED); goto err; } @@ -102,7 +113,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, emLen = RSA_size(rsa); if (EM[0] & (0xFF << MSBits)) { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_FIRST_OCTET_INVALID); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_FIRST_OCTET_INVALID); goto err; } if (MSBits == 0) @@ -112,12 +123,12 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, } if (emLen < (hLen + sLen + 2)) /* sLen can be small negative */ { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_DATA_TOO_LARGE); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_DATA_TOO_LARGE); goto err; } if (EM[emLen - 1] != 0xbc) { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_LAST_OCTET_INVALID); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_LAST_OCTET_INVALID); goto err; } maskedDBLen = emLen - hLen - 1; @@ -125,10 +136,10 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, DB = OPENSSL_malloc(maskedDBLen); if (!DB) { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, ERR_R_MALLOC_FAILURE); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, ERR_R_MALLOC_FAILURE); goto err; } - if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, Hash) < 0) + if (PKCS1_MGF1(DB, maskedDBLen, H, hLen, mgf1Hash) < 0) goto err; for (i = 0; i < maskedDBLen; i++) DB[i] ^= EM[i]; @@ -137,25 +148,28 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, for (i = 0; DB[i] == 0 && i < (maskedDBLen-1); i++) ; if (DB[i++] != 0x1) { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_RECOVERY_FAILED); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_RECOVERY_FAILED); goto err; } if (sLen >= 0 && (maskedDBLen - i) != sLen) { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED); goto err; } - EVP_MD_CTX_init(&ctx); - EVP_DigestInit_ex(&ctx, Hash, NULL); - EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes); - EVP_DigestUpdate(&ctx, mHash, hLen); + if (!EVP_DigestInit_ex(&ctx, Hash, NULL) + || !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes) + || !EVP_DigestUpdate(&ctx, mHash, hLen)) + goto err; if (maskedDBLen - i) - EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i); - EVP_DigestFinal(&ctx, H_, NULL); - EVP_MD_CTX_cleanup(&ctx); + { + if (!EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i)) + goto err; + } + if (!EVP_DigestFinal_ex(&ctx, H_, NULL)) + goto err; if (memcmp(H_, H, hLen)) { - RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS, RSA_R_BAD_SIGNATURE); + RSAerr(RSA_F_RSA_VERIFY_PKCS1_PSS_MGF1, RSA_R_BAD_SIGNATURE); ret = 0; } else @@ -164,6 +178,7 @@ int RSA_verify_PKCS1_PSS(RSA *rsa, const unsigned char *mHash, err: if (DB) OPENSSL_free(DB); + EVP_MD_CTX_cleanup(&ctx); return ret; @@ -173,12 +188,22 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, const unsigned char *mHash, const EVP_MD *Hash, int sLen) { + return RSA_padding_add_PKCS1_PSS_mgf1(rsa, EM, mHash, Hash, NULL, sLen); + } + +int RSA_padding_add_PKCS1_PSS_mgf1(RSA *rsa, unsigned char *EM, + const unsigned char *mHash, + const EVP_MD *Hash, const EVP_MD *mgf1Hash, int sLen) + { int i; int ret = 0; int hLen, maskedDBLen, MSBits, emLen; unsigned char *H, *salt = NULL, *p; EVP_MD_CTX ctx; + if (mgf1Hash == NULL) + mgf1Hash = Hash; + hLen = EVP_MD_size(Hash); if (hLen < 0) goto err; @@ -192,7 +217,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, else if (sLen == -2) sLen = -2; else if (sLen < -2) { - RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, RSA_R_SLEN_CHECK_FAILED); + RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1, RSA_R_SLEN_CHECK_FAILED); goto err; } @@ -209,8 +234,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, } else if (emLen < (hLen + sLen + 2)) { - RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, - RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); + RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); goto err; } if (sLen > 0) @@ -218,8 +242,7 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, salt = OPENSSL_malloc(sLen); if (!salt) { - RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS, - ERR_R_MALLOC_FAILURE); + RSAerr(RSA_F_RSA_PADDING_ADD_PKCS1_PSS_MGF1,ERR_R_MALLOC_FAILURE); goto err; } if (RAND_bytes(salt, sLen) <= 0) @@ -228,16 +251,18 @@ int RSA_padding_add_PKCS1_PSS(RSA *rsa, unsigned char *EM, maskedDBLen = emLen - hLen - 1; H = EM + maskedDBLen; EVP_MD_CTX_init(&ctx); - EVP_DigestInit_ex(&ctx, Hash, NULL); - EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes); - EVP_DigestUpdate(&ctx, mHash, hLen); - if (sLen) - EVP_DigestUpdate(&ctx, salt, sLen); - EVP_DigestFinal(&ctx, H, NULL); + if (!EVP_DigestInit_ex(&ctx, Hash, NULL) + || !EVP_DigestUpdate(&ctx, zeroes, sizeof zeroes) + || !EVP_DigestUpdate(&ctx, mHash, hLen)) + goto err; + if (sLen && !EVP_DigestUpdate(&ctx, salt, sLen)) + goto err; + if (!EVP_DigestFinal_ex(&ctx, H, NULL)) + goto err; EVP_MD_CTX_cleanup(&ctx); /* Generate dbMask in place then perform XOR on it */ - if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, Hash)) + if (PKCS1_MGF1(EM, maskedDBLen, H, hLen, mgf1Hash)) goto err; p = EM; diff --git a/src/lib/libcrypto/rsa/rsa_sign.c b/src/lib/libcrypto/rsa/rsa_sign.c index 0be4ec7fb0..b6f6037ae0 100644 --- a/src/lib/libcrypto/rsa/rsa_sign.c +++ b/src/lib/libcrypto/rsa/rsa_sign.c @@ -77,6 +77,14 @@ int RSA_sign(int type, const unsigned char *m, unsigned int m_len, const unsigned char *s = NULL; X509_ALGOR algor; ASN1_OCTET_STRING digest; +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) + && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) + { + RSAerr(RSA_F_RSA_SIGN, RSA_R_NON_FIPS_RSA_METHOD); + return 0; + } +#endif if((rsa->flags & RSA_FLAG_SIGN_VER) && rsa->meth->rsa_sign) { return rsa->meth->rsa_sign(type, m, m_len, @@ -153,6 +161,15 @@ int int_rsa_verify(int dtype, const unsigned char *m, unsigned char *s; X509_SIG *sig=NULL; +#ifdef OPENSSL_FIPS + if (FIPS_mode() && !(rsa->meth->flags & RSA_FLAG_FIPS_METHOD) + && !(rsa->flags & RSA_FLAG_NON_FIPS_ALLOW)) + { + RSAerr(RSA_F_INT_RSA_VERIFY, RSA_R_NON_FIPS_RSA_METHOD); + return 0; + } +#endif + if (siglen != (unsigned int)RSA_size(rsa)) { RSAerr(RSA_F_INT_RSA_VERIFY,RSA_R_WRONG_SIGNATURE_LENGTH); @@ -182,6 +199,22 @@ int int_rsa_verify(int dtype, const unsigned char *m, i=RSA_public_decrypt((int)siglen,sigbuf,s,rsa,RSA_PKCS1_PADDING); if (i <= 0) goto err; + /* Oddball MDC2 case: signature can be OCTET STRING. + * check for correct tag and length octets. + */ + if (dtype == NID_mdc2 && i == 18 && s[0] == 0x04 && s[1] == 0x10) + { + if (rm) + { + memcpy(rm, s + 2, 16); + *prm_len = 16; + ret = 1; + } + else if(memcmp(m, s + 2, 16)) + RSAerr(RSA_F_INT_RSA_VERIFY,RSA_R_BAD_SIGNATURE); + else + ret = 1; + } /* Special case: SSL signature */ if(dtype == NID_md5_sha1) { diff --git a/src/lib/libcrypto/s390xcap.c b/src/lib/libcrypto/s390xcap.c index ffbe0235f9..f2e94ef47e 100644 --- a/src/lib/libcrypto/s390xcap.c +++ b/src/lib/libcrypto/s390xcap.c @@ -4,7 +4,7 @@ #include #include -extern unsigned long OPENSSL_s390xcap_P; +extern unsigned long OPENSSL_s390xcap_P[]; static sigjmp_buf ill_jmp; static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); } @@ -16,7 +16,9 @@ void OPENSSL_cpuid_setup(void) sigset_t oset; struct sigaction ill_act,oact; - if (OPENSSL_s390xcap_P) return; + if (OPENSSL_s390xcap_P[0]) return; + + OPENSSL_s390xcap_P[0] = 1UL<<(8*sizeof(unsigned long)-1); memset(&ill_act,0,sizeof(ill_act)); ill_act.sa_handler = ill_handler; @@ -27,10 +29,8 @@ void OPENSSL_cpuid_setup(void) sigaction (SIGILL,&ill_act,&oact); /* protection against missing store-facility-list-extended */ - if (sigsetjmp(ill_jmp,0) == 0) - OPENSSL_s390xcap_P = OPENSSL_s390x_facilities(); - else - OPENSSL_s390xcap_P = 1UL<<63; + if (sigsetjmp(ill_jmp,1) == 0) + OPENSSL_s390x_facilities(); sigaction (SIGILL,&oact,NULL); sigprocmask(SIG_SETMASK,&oset,NULL); diff --git a/src/lib/libcrypto/s390xcpuid.S b/src/lib/libcrypto/s390xcpuid.S index b053c6a281..06815347e6 100644 --- a/src/lib/libcrypto/s390xcpuid.S +++ b/src/lib/libcrypto/s390xcpuid.S @@ -5,10 +5,14 @@ .align 16 OPENSSL_s390x_facilities: lghi %r0,0 - .long 0xb2b0f010 # stfle 16(%r15) - lg %r2,16(%r15) - larl %r1,OPENSSL_s390xcap_P - stg %r2,0(%r1) + larl %r2,OPENSSL_s390xcap_P + stg %r0,8(%r2) + .long 0xb2b02000 # stfle 0(%r2) + brc 8,.Ldone + lghi %r0,1 + .long 0xb2b02000 # stfle 0(%r2) +.Ldone: + lg %r2,0(%r2) br %r14 .size OPENSSL_s390x_facilities,.-OPENSSL_s390x_facilities @@ -58,6 +62,9 @@ OPENSSL_wipe_cpu: .type OPENSSL_cleanse,@function .align 16 OPENSSL_cleanse: +#if !defined(__s390x__) && !defined(__s390x) + llgfr %r3,%r3 +#endif lghi %r4,15 lghi %r0,0 clgr %r3,%r4 @@ -89,4 +96,4 @@ OPENSSL_cleanse: .section .init brasl %r14,OPENSSL_cpuid_setup -.comm OPENSSL_s390xcap_P,8,8 +.comm OPENSSL_s390xcap_P,16,8 diff --git a/src/lib/libcrypto/sha/asm/sha1-586.pl b/src/lib/libcrypto/sha/asm/sha1-586.pl index a1f876281a..1084d227fe 100644 --- a/src/lib/libcrypto/sha/asm/sha1-586.pl +++ b/src/lib/libcrypto/sha/asm/sha1-586.pl @@ -12,6 +12,8 @@ # commentary below], and in 2006 the rest was rewritten in order to # gain freedom to liberate licensing terms. +# January, September 2004. +# # It was noted that Intel IA-32 C compiler generates code which # performs ~30% *faster* on P4 CPU than original *hand-coded* # SHA1 assembler implementation. To address this problem (and @@ -31,12 +33,92 @@ # ---------------------------------------------------------------- # +# August 2009. +# +# George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as +# '(c&d) + (b&(c^d))', which allows to accumulate partial results +# and lighten "pressure" on scratch registers. This resulted in +# >12% performance improvement on contemporary AMD cores (with no +# degradation on other CPUs:-). Also, the code was revised to maximize +# "distance" between instructions producing input to 'lea' instruction +# and the 'lea' instruction itself, which is essential for Intel Atom +# core and resulted in ~15% improvement. + +# October 2010. +# +# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it +# is to offload message schedule denoted by Wt in NIST specification, +# or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel, +# and in SSE2 context was first explored by Dean Gaudet in 2004, see +# http://arctic.org/~dean/crypto/sha1.html. Since then several things +# have changed that made it interesting again: +# +# a) XMM units became faster and wider; +# b) instruction set became more versatile; +# c) an important observation was made by Max Locktykhin, which made +# it possible to reduce amount of instructions required to perform +# the operation in question, for further details see +# http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/. + +# April 2011. +# +# Add AVX code path, probably most controversial... The thing is that +# switch to AVX alone improves performance by as little as 4% in +# comparison to SSSE3 code path. But below result doesn't look like +# 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as +# pair of µ-ops, and it's the additional µ-ops, two per round, that +# make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded +# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with +# equivalent 'sh[rl]d' that is responsible for the impressive 5.1 +# cycles per processed byte. But 'sh[rl]d' is not something that used +# to be fast, nor does it appear to be fast in upcoming Bulldozer +# [according to its optimization manual]. Which is why AVX code path +# is guarded by *both* AVX and synthetic bit denoting Intel CPUs. +# One can argue that it's unfair to AMD, but without 'sh[rl]d' it +# makes no sense to keep the AVX code path. If somebody feels that +# strongly, it's probably more appropriate to discuss possibility of +# using vector rotate XOP on AMD... + +###################################################################### +# Current performance is summarized in following table. Numbers are +# CPU clock cycles spent to process single byte (less is better). +# +# x86 SSSE3 AVX +# Pentium 15.7 - +# PIII 11.5 - +# P4 10.6 - +# AMD K8 7.1 - +# Core2 7.3 6.1/+20% - +# Atom 12.5 9.5(*)/+32% - +# Westmere 7.3 5.6/+30% - +# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70% +# +# (*) Loop is 1056 instructions long and expected result is ~8.25. +# It remains mystery [to me] why ILP is limited to 1.7. +# +# (**) As per above comment, the result is for AVX *plus* sh[rl]d. + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; &asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386"); +$xmm=$ymm=0; +for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } + +$ymm=1 if ($xmm && + `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/ && + $1>=2.19); # first version supporting AVX + +$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && + $1>=2.03); # first version supporting AVX + +&external_label("OPENSSL_ia32cap_P") if ($xmm); + + $A="eax"; $B="ebx"; $C="ecx"; @@ -47,6 +129,10 @@ $tmp1="ebp"; @V=($A,$B,$C,$D,$E,$T); +$alt=0; # 1 denotes alternative IALU implementation, which performs + # 8% *worse* on P4, same on Westmere and Atom, 2% better on + # Sandy Bridge... + sub BODY_00_15 { local($n,$a,$b,$c,$d,$e,$f)=@_; @@ -59,16 +145,18 @@ sub BODY_00_15 &rotl($tmp1,5); # tmp1=ROTATE(a,5) &xor($f,$d); &add($tmp1,$e); # tmp1+=e; - &and($f,$b); - &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded + &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded # with xi, also note that e becomes # f in next round... - &xor($f,$d); # f holds F_00_19(b,c,d) + &and($f,$b); &rotr($b,2); # b=ROTATE(b,30) - &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi + &xor($f,$d); # f holds F_00_19(b,c,d) + &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi - if ($n==15) { &add($f,$tmp1); } # f+=tmp1 + if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round + &add($f,$tmp1); } # f+=tmp1 else { &add($tmp1,$f); } # f becomes a in next round + &mov($tmp1,$a) if ($alt && $n==15); } sub BODY_16_19 @@ -77,22 +165,41 @@ sub BODY_16_19 &comment("16_19 $n"); - &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) - &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d) - &xor($f,&swtmp(($n+2)%16)); - &xor($tmp1,$d); - &xor($f,&swtmp(($n+8)%16)); - &and($tmp1,$b); # tmp1 holds F_00_19(b,c,d) - &rotr($b,2); # b=ROTATE(b,30) +if ($alt) { + &xor($c,$d); + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &and($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), b&=c^d + &xor($f,&swtmp(($n+8)%16)); + &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd + &rotl($f,1); # f=ROTATE(f,1) + &add($e,$tmp1); # e+=F_00_19(b,c,d) + &xor($c,$d); # restore $c + &mov($tmp1,$a); # b in next round + &rotr($b,$n==16?2:7); # b=ROTATE(b,30) + &mov(&swtmp($n%16),$f); # xi=f + &rotl($a,5); # ROTATE(a,5) + &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e + &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round + &add($f,$a); # f+=ROTATE(a,5) +} else { + &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d) + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &xor($tmp1,$d); + &xor($f,&swtmp(($n+8)%16)); + &and($tmp1,$b); &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) - &mov(&swtmp($n%16),$f); # xi=f - &lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e - &mov($e,$a); # e becomes volatile - &rotl($e,5); # e=ROTATE(a,5) - &add($f,$tmp1); # f+=F_00_19(b,c,d) - &add($f,$e); # f+=ROTATE(a,5) + &add($e,$tmp1); # e+=F_00_19(b,c,d) + &mov($tmp1,$a); + &rotr($b,2); # b=ROTATE(b,30) + &mov(&swtmp($n%16),$f); # xi=f + &rotl($tmp1,5); # ROTATE(a,5) + &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e + &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round + &add($f,$tmp1); # f+=ROTATE(a,5) +} } sub BODY_20_39 @@ -102,21 +209,41 @@ sub BODY_20_39 &comment("20_39 $n"); +if ($alt) { + &xor($tmp1,$c); # tmp1 to hold F_20_39(b,c,d), b^=c + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) + &xor($f,&swtmp(($n+8)%16)); + &add($e,$tmp1); # e+=F_20_39(b,c,d) + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd + &rotl($f,1); # f=ROTATE(f,1) + &mov($tmp1,$a); # b in next round + &rotr($b,7); # b=ROTATE(b,30) + &mov(&swtmp($n%16),$f) if($n<77);# xi=f + &rotl($a,5); # ROTATE(a,5) + &xor($b,$c) if($n==39);# warm up for BODY_40_59 + &and($tmp1,$b) if($n==39); + &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY + &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round + &add($f,$a); # f+=ROTATE(a,5) + &rotr($a,5) if ($n==79); +} else { &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d) - &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) - &rotr($b,2); # b=ROTATE(b,30) - &xor($f,&swtmp(($n+2)%16)); + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$c); &xor($f,&swtmp(($n+8)%16)); &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) - &add($tmp1,$e); - &mov(&swtmp($n%16),$f); # xi=f - &mov($e,$a); # e becomes volatile - &rotl($e,5); # e=ROTATE(a,5) - &lea($f,&DWP($K,$f,$tmp1)); # f+=K_20_39+e - &add($f,$e); # f+=ROTATE(a,5) + &add($e,$tmp1); # e+=F_20_39(b,c,d) + &rotr($b,2); # b=ROTATE(b,30) + &mov($tmp1,$a); + &rotl($tmp1,5); # ROTATE(a,5) + &mov(&swtmp($n%16),$f) if($n<77);# xi=f + &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY + &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round + &add($f,$tmp1); # f+=ROTATE(a,5) +} } sub BODY_40_59 @@ -125,41 +252,86 @@ sub BODY_40_59 &comment("40_59 $n"); - &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) - &mov($tmp1,&swtmp(($n+2)%16)); - &xor($f,$tmp1); - &mov($tmp1,&swtmp(($n+8)%16)); - &xor($f,$tmp1); - &mov($tmp1,&swtmp(($n+13)%16)); - &xor($f,$tmp1); # f holds xa^xb^xc^xd - &mov($tmp1,$b); # tmp1 to hold F_40_59(b,c,d) +if ($alt) { + &add($e,$tmp1); # e+=b&(c^d) + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &mov($tmp1,$d); + &xor($f,&swtmp(($n+8)%16)); + &xor($c,$d); # restore $c + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) - &or($tmp1,$c); - &mov(&swtmp($n%16),$f); # xi=f - &and($tmp1,$d); - &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e - &mov($e,$b); # e becomes volatile and is used - # to calculate F_40_59(b,c,d) + &and($tmp1,$c); + &rotr($b,7); # b=ROTATE(b,30) + &add($e,$tmp1); # e+=c&d + &mov($tmp1,$a); # b in next round + &mov(&swtmp($n%16),$f); # xi=f + &rotl($a,5); # ROTATE(a,5) + &xor($b,$c) if ($n<59); + &and($tmp1,$b) if ($n<59);# tmp1 to hold F_40_59(b,c,d) + &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d)) + &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round + &add($f,$a); # f+=ROTATE(a,5) +} else { + &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d) + &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &xor($tmp1,$d); + &xor($f,&swtmp(($n+8)%16)); + &and($tmp1,$b); + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd + &rotl($f,1); # f=ROTATE(f,1) + &add($tmp1,$e); # b&(c^d)+=e &rotr($b,2); # b=ROTATE(b,30) - &and($e,$c); - &or($tmp1,$e); # tmp1 holds F_40_59(b,c,d) - &mov($e,$a); - &rotl($e,5); # e=ROTATE(a,5) - &add($f,$tmp1); # f+=tmp1; + &mov($e,$a); # e becomes volatile + &rotl($e,5); # ROTATE(a,5) + &mov(&swtmp($n%16),$f); # xi=f + &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d)) + &mov($tmp1,$c); &add($f,$e); # f+=ROTATE(a,5) + &and($tmp1,$d); + &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round + &add($f,$tmp1); # f+=c&d +} } &function_begin("sha1_block_data_order"); +if ($xmm) { + &static_label("ssse3_shortcut"); + &static_label("avx_shortcut") if ($ymm); + &static_label("K_XX_XX"); + + &call (&label("pic_point")); # make it PIC! + &set_label("pic_point"); + &blindpop($tmp1); + &picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point")); + &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); + + &mov ($A,&DWP(0,$T)); + &mov ($D,&DWP(4,$T)); + &test ($D,1<<9); # check SSSE3 bit + &jz (&label("x86")); + &test ($A,1<<24); # check FXSR bit + &jz (&label("x86")); + if ($ymm) { + &and ($D,1<<28); # mask AVX bit + &and ($A,1<<30); # mask "Intel CPU" bit + &or ($A,$D); + &cmp ($A,1<<28|1<<30); + &je (&label("avx_shortcut")); + } + &jmp (&label("ssse3_shortcut")); + &set_label("x86",16); +} &mov($tmp1,&wparam(0)); # SHA_CTX *c &mov($T,&wparam(1)); # const void *input &mov($A,&wparam(2)); # size_t num - &stack_push(16); # allocate X[16] + &stack_push(16+3); # allocate X[16] &shl($A,6); &add($A,$T); &mov(&wparam(2),$A); # pointer beyond the end of input &mov($E,&DWP(16,$tmp1));# pre-load E + &jmp(&label("loop")); - &set_label("loop",16); +&set_label("loop",16); # copy input chunk to X, but reversing byte order! for ($i=0; $i<16; $i+=4) @@ -213,8 +385,845 @@ sub BODY_40_59 &mov(&DWP(16,$tmp1),$C); &jb(&label("loop")); - &stack_pop(16); + &stack_pop(16+3); &function_end("sha1_block_data_order"); + +if ($xmm) { +###################################################################### +# The SSSE3 implementation. +# +# %xmm[0-7] are used as ring @X[] buffer containing quadruples of last +# 32 elements of the message schedule or Xupdate outputs. First 4 +# quadruples are simply byte-swapped input, next 4 are calculated +# according to method originally suggested by Dean Gaudet (modulo +# being implemented in SSSE3). Once 8 quadruples or 32 elements are +# collected, it switches to routine proposed by Max Locktyukhin. +# +# Calculations inevitably require temporary reqisters, and there are +# no %xmm registers left to spare. For this reason part of the ring +# buffer, X[2..4] to be specific, is offloaded to 3 quadriples ring +# buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] - +# X[-5], and X[4] - X[-4]... +# +# Another notable optimization is aggressive stack frame compression +# aiming to minimize amount of 9-byte instructions... +# +# Yet another notable optimization is "jumping" $B variable. It means +# that there is no register permanently allocated for $B value. This +# allowed to eliminate one instruction from body_20_39... +# +my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded +my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4 +my @V=($A,$B,$C,$D,$E); +my $j=0; # hash round +my @T=($T,$tmp1); +my $inp; + +my $_rol=sub { &rol(@_) }; +my $_ror=sub { &ror(@_) }; + +&function_begin("_sha1_block_data_order_ssse3"); + &call (&label("pic_point")); # make it PIC! + &set_label("pic_point"); + &blindpop($tmp1); + &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); +&set_label("ssse3_shortcut"); + + &movdqa (@X[3],&QWP(0,$tmp1)); # K_00_19 + &movdqa (@X[4],&QWP(16,$tmp1)); # K_20_39 + &movdqa (@X[5],&QWP(32,$tmp1)); # K_40_59 + &movdqa (@X[6],&QWP(48,$tmp1)); # K_60_79 + &movdqa (@X[2],&QWP(64,$tmp1)); # pbswap mask + + &mov ($E,&wparam(0)); # load argument block + &mov ($inp=@T[1],&wparam(1)); + &mov ($D,&wparam(2)); + &mov (@T[0],"esp"); + + # stack frame layout + # + # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area + # X[4]+K X[5]+K X[6]+K X[7]+K + # X[8]+K X[9]+K X[10]+K X[11]+K + # X[12]+K X[13]+K X[14]+K X[15]+K + # + # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area + # X[4] X[5] X[6] X[7] + # X[8] X[9] X[10] X[11] # even borrowed for K_00_19 + # + # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants + # K_40_59 K_40_59 K_40_59 K_40_59 + # K_60_79 K_60_79 K_60_79 K_60_79 + # K_00_19 K_00_19 K_00_19 K_00_19 + # pbswap mask + # + # +192 ctx # argument block + # +196 inp + # +200 end + # +204 esp + &sub ("esp",208); + &and ("esp",-64); + + &movdqa (&QWP(112+0,"esp"),@X[4]); # copy constants + &movdqa (&QWP(112+16,"esp"),@X[5]); + &movdqa (&QWP(112+32,"esp"),@X[6]); + &shl ($D,6); # len*64 + &movdqa (&QWP(112+48,"esp"),@X[3]); + &add ($D,$inp); # end of input + &movdqa (&QWP(112+64,"esp"),@X[2]); + &add ($inp,64); + &mov (&DWP(192+0,"esp"),$E); # save argument block + &mov (&DWP(192+4,"esp"),$inp); + &mov (&DWP(192+8,"esp"),$D); + &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp + + &mov ($A,&DWP(0,$E)); # load context + &mov ($B,&DWP(4,$E)); + &mov ($C,&DWP(8,$E)); + &mov ($D,&DWP(12,$E)); + &mov ($E,&DWP(16,$E)); + &mov (@T[0],$B); # magic seed + + &movdqu (@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3] + &movdqu (@X[-3&7],&QWP(-48,$inp)); + &movdqu (@X[-2&7],&QWP(-32,$inp)); + &movdqu (@X[-1&7],&QWP(-16,$inp)); + &pshufb (@X[-4&7],@X[2]); # byte swap + &pshufb (@X[-3&7],@X[2]); + &pshufb (@X[-2&7],@X[2]); + &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot + &pshufb (@X[-1&7],@X[2]); + &paddd (@X[-4&7],@X[3]); # add K_00_19 + &paddd (@X[-3&7],@X[3]); + &paddd (@X[-2&7],@X[3]); + &movdqa (&QWP(0,"esp"),@X[-4&7]); # X[]+K xfer to IALU + &psubd (@X[-4&7],@X[3]); # restore X[] + &movdqa (&QWP(0+16,"esp"),@X[-3&7]); + &psubd (@X[-3&7],@X[3]); + &movdqa (&QWP(0+32,"esp"),@X[-2&7]); + &psubd (@X[-2&7],@X[3]); + &movdqa (@X[0],@X[-3&7]); + &jmp (&label("loop")); + +###################################################################### +# SSE instruction sequence is first broken to groups of indepentent +# instructions, independent in respect to their inputs and shifter +# (not all architectures have more than one). Then IALU instructions +# are "knitted in" between the SSE groups. Distance is maintained for +# SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer +# [which allegedly also implements SSSE3]... +# +# Temporary registers usage. X[2] is volatile at the entry and at the +# end is restored from backtrace ring buffer. X[3] is expected to +# contain current K_XX_XX constant and is used to caclulate X[-1]+K +# from previous round, it becomes volatile the moment the value is +# saved to stack for transfer to IALU. X[4] becomes volatile whenever +# X[-4] is accumulated and offloaded to backtrace ring buffer, at the +# end it is loaded with next K_XX_XX [which becomes X[3] in next +# round]... +# +sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 40 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" + &movdqa (@X[2],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + + &paddd (@X[3],@X[-1&7]); + &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer + eval(shift(@insns)); + eval(shift(@insns)); + &psrldq (@X[2],4); # "X[-3]", 3 dwords + eval(shift(@insns)); + eval(shift(@insns)); + &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (@X[4],@X[0]); + &movdqa (@X[2],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pslldq (@X[4],12); # "X[0]"<<96, extract one dword + &paddd (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &psrld (@X[2],31); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (@X[3],@X[4]); + eval(shift(@insns)); + eval(shift(@insns)); + + &psrld (@X[4],30); + &por (@X[0],@X[2]); # "X[0]"<<<=1 + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer + eval(shift(@insns)); + eval(shift(@insns)); + + &pslld (@X[3],2); + &pxor (@X[0],@X[4]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2 + &movdqa (@X[1],@X[-2&7]) if ($Xi<7); + eval(shift(@insns)); + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] +} + +sub Xupdate_ssse3_32_79() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my ($a,$b,$c,$d,$e); + + &movdqa (@X[2],@X[-1&7]) if ($Xi==8); + eval(shift(@insns)); # body_20_39 + &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + &palignr(@X[2],@X[-2&7],8); # compose "X[-6]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer + eval(shift(@insns)); + eval(shift(@insns)); + if ($Xi%5) { + &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... + } else { # ... or load next one + &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); + } + &paddd (@X[3],@X[-1&7]); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &movdqa (@X[2],@X[0]); + &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &pslld (@X[0],2); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &psrld (@X[2],30); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &por (@X[0],@X[2]); # "X[0]"<<<=2 + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + &movdqa (@X[3],@X[0]) if ($Xi<19); + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions + + $Xi++; push(@X,shift(@X)); # "rotate" X[] +} + +sub Xuplast_ssse3_80() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + &paddd (@X[3],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU + + foreach (@insns) { eval; } # remaining instructions + + &mov ($inp=@T[1],&DWP(192+4,"esp")); + &cmp ($inp,&DWP(192+8,"esp")); + &je (&label("done")); + + &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19 + &movdqa (@X[2],&QWP(112+64,"esp")); # pbswap mask + &movdqu (@X[-4&7],&QWP(0,$inp)); # load input + &movdqu (@X[-3&7],&QWP(16,$inp)); + &movdqu (@X[-2&7],&QWP(32,$inp)); + &movdqu (@X[-1&7],&QWP(48,$inp)); + &add ($inp,64); + &pshufb (@X[-4&7],@X[2]); # byte swap + &mov (&DWP(192+4,"esp"),$inp); + &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot + + $Xi=0; +} + +sub Xloop_ssse3() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &pshufb (@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[($Xi-4)&7],@X[3]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + &psubd (@X[($Xi-4)&7],@X[3]); + + foreach (@insns) { eval; } + $Xi++; +} + +sub Xtail_ssse3() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + foreach (@insns) { eval; } +} + +sub body_00_19 () { + ( + '($a,$b,$c,$d,$e)=@V;'. + '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer + '&xor ($c,$d);', + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&and (@T[0],$c);', # ($b&($c^$d)) + '&xor ($c,$d);', # restore $c + '&xor (@T[0],$d);', + '&add ($e,$a);', + '&$_ror ($b,$j?7:2);', # $b>>>2 + '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +sub body_20_39 () { + ( + '($a,$b,$c,$d,$e)=@V;'. + '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer + '&xor (@T[0],$d);', # ($b^$d) + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&xor (@T[0],$c);', # ($b^$d^$c) + '&add ($e,$a);', + '&$_ror ($b,7);', # $b>>>2 + '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +sub body_40_59 () { + ( + '($a,$b,$c,$d,$e)=@V;'. + '&mov (@T[1],$c);', + '&xor ($c,$d);', + '&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer + '&and (@T[1],$d);', + '&and (@T[0],$c);', # ($b&($c^$d)) + '&$_ror ($b,7);', # $b>>>2 + '&add ($e,@T[1]);', + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&add ($e,@T[0]);', + '&xor ($c,$d);', # restore $c + '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +&set_label("loop",16); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_32_79(\&body_00_19); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + + &Xloop_ssse3(\&body_20_39); + &Xloop_ssse3(\&body_20_39); + &Xloop_ssse3(\&body_20_39); + + &mov (@T[1],&DWP(192,"esp")); # update context + &add ($A,&DWP(0,@T[1])); + &add (@T[0],&DWP(4,@T[1])); # $b + &add ($C,&DWP(8,@T[1])); + &mov (&DWP(0,@T[1]),$A); + &add ($D,&DWP(12,@T[1])); + &mov (&DWP(4,@T[1]),@T[0]); + &add ($E,&DWP(16,@T[1])); + &mov (&DWP(8,@T[1]),$C); + &mov ($B,@T[0]); + &mov (&DWP(12,@T[1]),$D); + &mov (&DWP(16,@T[1]),$E); + &movdqa (@X[0],@X[-3&7]); + + &jmp (&label("loop")); + +&set_label("done",16); $j=$saved_j; @V=@saved_V; + + &Xtail_ssse3(\&body_20_39); + &Xtail_ssse3(\&body_20_39); + &Xtail_ssse3(\&body_20_39); + + &mov (@T[1],&DWP(192,"esp")); # update context + &add ($A,&DWP(0,@T[1])); + &mov ("esp",&DWP(192+12,"esp")); # restore %esp + &add (@T[0],&DWP(4,@T[1])); # $b + &add ($C,&DWP(8,@T[1])); + &mov (&DWP(0,@T[1]),$A); + &add ($D,&DWP(12,@T[1])); + &mov (&DWP(4,@T[1]),@T[0]); + &add ($E,&DWP(16,@T[1])); + &mov (&DWP(8,@T[1]),$C); + &mov (&DWP(12,@T[1]),$D); + &mov (&DWP(16,@T[1]),$E); + +&function_end("_sha1_block_data_order_ssse3"); + +if ($ymm) { +my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded +my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4 +my @V=($A,$B,$C,$D,$E); +my $j=0; # hash round +my @T=($T,$tmp1); +my $inp; + +my $_rol=sub { &shld(@_[0],@_) }; +my $_ror=sub { &shrd(@_[0],@_) }; + +&function_begin("_sha1_block_data_order_avx"); + &call (&label("pic_point")); # make it PIC! + &set_label("pic_point"); + &blindpop($tmp1); + &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); +&set_label("avx_shortcut"); + &vzeroall(); + + &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19 + &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39 + &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59 + &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79 + &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask + + &mov ($E,&wparam(0)); # load argument block + &mov ($inp=@T[1],&wparam(1)); + &mov ($D,&wparam(2)); + &mov (@T[0],"esp"); + + # stack frame layout + # + # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area + # X[4]+K X[5]+K X[6]+K X[7]+K + # X[8]+K X[9]+K X[10]+K X[11]+K + # X[12]+K X[13]+K X[14]+K X[15]+K + # + # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area + # X[4] X[5] X[6] X[7] + # X[8] X[9] X[10] X[11] # even borrowed for K_00_19 + # + # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants + # K_40_59 K_40_59 K_40_59 K_40_59 + # K_60_79 K_60_79 K_60_79 K_60_79 + # K_00_19 K_00_19 K_00_19 K_00_19 + # pbswap mask + # + # +192 ctx # argument block + # +196 inp + # +200 end + # +204 esp + &sub ("esp",208); + &and ("esp",-64); + + &vmovdqa(&QWP(112+0,"esp"),@X[4]); # copy constants + &vmovdqa(&QWP(112+16,"esp"),@X[5]); + &vmovdqa(&QWP(112+32,"esp"),@X[6]); + &shl ($D,6); # len*64 + &vmovdqa(&QWP(112+48,"esp"),@X[3]); + &add ($D,$inp); # end of input + &vmovdqa(&QWP(112+64,"esp"),@X[2]); + &add ($inp,64); + &mov (&DWP(192+0,"esp"),$E); # save argument block + &mov (&DWP(192+4,"esp"),$inp); + &mov (&DWP(192+8,"esp"),$D); + &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp + + &mov ($A,&DWP(0,$E)); # load context + &mov ($B,&DWP(4,$E)); + &mov ($C,&DWP(8,$E)); + &mov ($D,&DWP(12,$E)); + &mov ($E,&DWP(16,$E)); + &mov (@T[0],$B); # magic seed + + &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3] + &vmovdqu(@X[-3&7],&QWP(-48,$inp)); + &vmovdqu(@X[-2&7],&QWP(-32,$inp)); + &vmovdqu(@X[-1&7],&QWP(-16,$inp)); + &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap + &vpshufb(@X[-3&7],@X[-3&7],@X[2]); + &vpshufb(@X[-2&7],@X[-2&7],@X[2]); + &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot + &vpshufb(@X[-1&7],@X[-1&7],@X[2]); + &vpaddd (@X[0],@X[-4&7],@X[3]); # add K_00_19 + &vpaddd (@X[1],@X[-3&7],@X[3]); + &vpaddd (@X[2],@X[-2&7],@X[3]); + &vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU + &vmovdqa(&QWP(0+16,"esp"),@X[1]); + &vmovdqa(&QWP(0+32,"esp"),@X[2]); + &jmp (&label("loop")); + +sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 40 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpaddd (@X[3],@X[3],@X[-1&7]); + &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrldq(@X[2],@X[-1&7],4); # "X[-3]", 3 dwords + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[2],@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@X[2],@X[0],31); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslldq(@X[4],@X[0],12); # "X[0]"<<96, extract one dword + &vpaddd (@X[0],@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@X[3],@X[4],30); + &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=1 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslld (@X[4],@X[4],2); + &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor (@X[0],@X[0],@X[3]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@X[4]); # "X[0]"^=("X[0]"<<96)<<<2 + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX + eval(shift(@insns)); + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] +} + +sub Xupdate_avx_32_79() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my ($a,$b,$c,$d,$e); + + &vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]" + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer + eval(shift(@insns)); + eval(shift(@insns)); + if ($Xi%5) { + &vmovdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... + } else { # ... or load next one + &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); + } + &vpaddd (@X[3],@X[3],@X[-1&7]); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-6]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &vpsrld (@X[2],@X[0],30); + &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpslld (@X[0],@X[0],2); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=2 + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions + + $Xi++; push(@X,shift(@X)); # "rotate" X[] +} + +sub Xuplast_avx_80() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + &vpaddd (@X[3],@X[3],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU + + foreach (@insns) { eval; } # remaining instructions + + &mov ($inp=@T[1],&DWP(192+4,"esp")); + &cmp ($inp,&DWP(192+8,"esp")); + &je (&label("done")); + + &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19 + &vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask + &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input + &vmovdqu(@X[-3&7],&QWP(16,$inp)); + &vmovdqu(@X[-2&7],&QWP(32,$inp)); + &vmovdqu(@X[-1&7],&QWP(48,$inp)); + &add ($inp,64); + &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap + &mov (&DWP(192+4,"esp"),$inp); + &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot + + $Xi=0; +} + +sub Xloop_avx() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &vpshufb (@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@X[3]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa (&QWP(0+16*$Xi,"esp"),@X[$Xi&7]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + foreach (@insns) { eval; } + $Xi++; +} + +sub Xtail_avx() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + foreach (@insns) { eval; } +} + +&set_label("loop",16); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_32_79(\&body_00_19); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_20_39); + &Xuplast_avx_80(\&body_20_39); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + + &Xloop_avx(\&body_20_39); + &Xloop_avx(\&body_20_39); + &Xloop_avx(\&body_20_39); + + &mov (@T[1],&DWP(192,"esp")); # update context + &add ($A,&DWP(0,@T[1])); + &add (@T[0],&DWP(4,@T[1])); # $b + &add ($C,&DWP(8,@T[1])); + &mov (&DWP(0,@T[1]),$A); + &add ($D,&DWP(12,@T[1])); + &mov (&DWP(4,@T[1]),@T[0]); + &add ($E,&DWP(16,@T[1])); + &mov (&DWP(8,@T[1]),$C); + &mov ($B,@T[0]); + &mov (&DWP(12,@T[1]),$D); + &mov (&DWP(16,@T[1]),$E); + + &jmp (&label("loop")); + +&set_label("done",16); $j=$saved_j; @V=@saved_V; + + &Xtail_avx(\&body_20_39); + &Xtail_avx(\&body_20_39); + &Xtail_avx(\&body_20_39); + + &vzeroall(); + + &mov (@T[1],&DWP(192,"esp")); # update context + &add ($A,&DWP(0,@T[1])); + &mov ("esp",&DWP(192+12,"esp")); # restore %esp + &add (@T[0],&DWP(4,@T[1])); # $b + &add ($C,&DWP(8,@T[1])); + &mov (&DWP(0,@T[1]),$A); + &add ($D,&DWP(12,@T[1])); + &mov (&DWP(4,@T[1]),@T[0]); + &add ($E,&DWP(16,@T[1])); + &mov (&DWP(8,@T[1]),$C); + &mov (&DWP(12,@T[1]),$D); + &mov (&DWP(16,@T[1]),$E); +&function_end("_sha1_block_data_order_avx"); +} +&set_label("K_XX_XX",64); +&data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19 +&data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39 +&data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59 +&data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79 +&data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask +} &asciz("SHA1 block transform for x86, CRYPTOGAMS by "); &asm_finish(); diff --git a/src/lib/libcrypto/sha/asm/sha1-alpha.pl b/src/lib/libcrypto/sha/asm/sha1-alpha.pl new file mode 100644 index 0000000000..6c4b9251fd --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha1-alpha.pl @@ -0,0 +1,322 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA1 block procedure for Alpha. + +# On 21264 performance is 33% better than code generated by vendor +# compiler, and 75% better than GCC [3.4], and in absolute terms is +# 8.7 cycles per processed byte. Implementation features vectorized +# byte swap, but not Xupdate. + +@X=( "\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", + "\$8", "\$9", "\$10", "\$11", "\$12", "\$13", "\$14", "\$15"); +$ctx="a0"; # $16 +$inp="a1"; +$num="a2"; +$A="a3"; +$B="a4"; # 20 +$C="a5"; +$D="t8"; +$E="t9"; @V=($A,$B,$C,$D,$E); +$t0="t10"; # 24 +$t1="t11"; +$t2="ra"; +$t3="t12"; +$K="AT"; # 28 + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i==0); + ldq_u @X[0],0+0($inp) + ldq_u @X[1],0+7($inp) +___ +$code.=<<___ if (!($i&1) && $i<14); + ldq_u @X[$i+2],($i+2)*4+0($inp) + ldq_u @X[$i+3],($i+2)*4+7($inp) +___ +$code.=<<___ if (!($i&1) && $i<15); + extql @X[$i],$inp,@X[$i] + extqh @X[$i+1],$inp,@X[$i+1] + + or @X[$i+1],@X[$i],@X[$i] # pair of 32-bit values are fetched + + srl @X[$i],24,$t0 # vectorized byte swap + srl @X[$i],8,$t2 + + sll @X[$i],8,$t3 + sll @X[$i],24,@X[$i] + zapnot $t0,0x11,$t0 + zapnot $t2,0x22,$t2 + + zapnot @X[$i],0x88,@X[$i] + or $t0,$t2,$t0 + zapnot $t3,0x44,$t3 + sll $a,5,$t1 + + or @X[$i],$t0,@X[$i] + addl $K,$e,$e + and $b,$c,$t2 + zapnot $a,0xf,$a + + or @X[$i],$t3,@X[$i] + srl $a,27,$t0 + bic $d,$b,$t3 + sll $b,30,$b + + extll @X[$i],4,@X[$i+1] # extract upper half + or $t2,$t3,$t2 + addl @X[$i],$e,$e + + addl $t1,$e,$e + srl $b,32,$t3 + zapnot @X[$i],0xf,@X[$i] + + addl $t0,$e,$e + addl $t2,$e,$e + or $t3,$b,$b +___ +$code.=<<___ if (($i&1) && $i<15); + sll $a,5,$t1 + addl $K,$e,$e + and $b,$c,$t2 + zapnot $a,0xf,$a + + srl $a,27,$t0 + addl @X[$i%16],$e,$e + bic $d,$b,$t3 + sll $b,30,$b + + or $t2,$t3,$t2 + addl $t1,$e,$e + srl $b,32,$t3 + zapnot @X[$i],0xf,@X[$i] + + addl $t0,$e,$e + addl $t2,$e,$e + or $t3,$b,$b +___ +$code.=<<___ if ($i>=15); # with forward Xupdate + sll $a,5,$t1 + addl $K,$e,$e + and $b,$c,$t2 + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + + zapnot $a,0xf,$a + addl @X[$i%16],$e,$e + bic $d,$b,$t3 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + + srl $a,27,$t0 + addl $t1,$e,$e + or $t2,$t3,$t2 + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + + sll $b,30,$b + addl $t0,$e,$e + srl @X[$j%16],31,$t1 + + addl $t2,$e,$e + srl $b,32,$t3 + addl @X[$j%16],@X[$j%16],@X[$j%16] + + or $t3,$b,$b + zapnot @X[$i%16],0xf,@X[$i%16] + or $t1,@X[$j%16],@X[$j%16] +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); # with forward Xupdate + sll $a,5,$t1 + addl $K,$e,$e + zapnot $a,0xf,$a + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + + sll $b,30,$t3 + addl $t1,$e,$e + xor $b,$c,$t2 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + + srl $b,2,$b + addl @X[$i%16],$e,$e + xor $d,$t2,$t2 + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + + srl @X[$j%16],31,$t1 + addl $t2,$e,$e + srl $a,27,$t0 + addl @X[$j%16],@X[$j%16],@X[$j%16] + + or $t3,$b,$b + addl $t0,$e,$e + or $t1,@X[$j%16],@X[$j%16] +___ +$code.=<<___ if ($i<77); + zapnot @X[$i%16],0xf,@X[$i%16] +___ +$code.=<<___ if ($i==79); # with context fetch + sll $a,5,$t1 + addl $K,$e,$e + zapnot $a,0xf,$a + ldl @X[0],0($ctx) + + sll $b,30,$t3 + addl $t1,$e,$e + xor $b,$c,$t2 + ldl @X[1],4($ctx) + + srl $b,2,$b + addl @X[$i%16],$e,$e + xor $d,$t2,$t2 + ldl @X[2],8($ctx) + + srl $a,27,$t0 + addl $t2,$e,$e + ldl @X[3],12($ctx) + + or $t3,$b,$b + addl $t0,$e,$e + ldl @X[4],16($ctx) +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___; # with forward Xupdate + sll $a,5,$t1 + addl $K,$e,$e + zapnot $a,0xf,$a + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + + srl $a,27,$t0 + and $b,$c,$t2 + and $b,$d,$t3 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + + sll $b,30,$b + addl $t1,$e,$e + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + + srl @X[$j%16],31,$t1 + addl $t0,$e,$e + or $t2,$t3,$t2 + and $c,$d,$t3 + + or $t2,$t3,$t2 + srl $b,32,$t3 + addl @X[$i%16],$e,$e + addl @X[$j%16],@X[$j%16],@X[$j%16] + + or $t3,$b,$b + addl $t2,$e,$e + or $t1,@X[$j%16],@X[$j%16] + zapnot @X[$i%16],0xf,@X[$i%16] +___ +} + +$code=<<___; +#ifdef __linux__ +#include +#else +#include +#include +#endif + +.text + +.set noat +.set noreorder +.globl sha1_block_data_order +.align 5 +.ent sha1_block_data_order +sha1_block_data_order: + lda sp,-64(sp) + stq ra,0(sp) + stq s0,8(sp) + stq s1,16(sp) + stq s2,24(sp) + stq s3,32(sp) + stq s4,40(sp) + stq s5,48(sp) + stq fp,56(sp) + .mask 0x0400fe00,-64 + .frame sp,64,ra + .prologue 0 + + ldl $A,0($ctx) + ldl $B,4($ctx) + sll $num,6,$num + ldl $C,8($ctx) + ldl $D,12($ctx) + ldl $E,16($ctx) + addq $inp,$num,$num + +.Lloop: + .set noreorder + ldah $K,23170(zero) + zapnot $B,0xf,$B + lda $K,31129($K) # K_00_19 +___ +for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + ldah $K,28378(zero) + lda $K,-5215($K) # K_20_39 +___ +for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + ldah $K,-28900(zero) + lda $K,-17188($K) # K_40_59 +___ +for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + ldah $K,-13725(zero) + lda $K,-15914($K) # K_60_79 +___ +for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + addl @X[0],$A,$A + addl @X[1],$B,$B + addl @X[2],$C,$C + addl @X[3],$D,$D + addl @X[4],$E,$E + stl $A,0($ctx) + stl $B,4($ctx) + addq $inp,64,$inp + stl $C,8($ctx) + stl $D,12($ctx) + stl $E,16($ctx) + cmpult $inp,$num,$t1 + bne $t1,.Lloop + + .set noreorder + ldq ra,0(sp) + ldq s0,8(sp) + ldq s1,16(sp) + ldq s2,24(sp) + ldq s3,32(sp) + ldq s4,40(sp) + ldq s5,48(sp) + ldq fp,56(sp) + lda sp,64(sp) + ret (ra) +.end sha1_block_data_order +.ascii "SHA1 block transform for Alpha, CRYPTOGAMS by " +.align 2 +___ +$output=shift and open STDOUT,">$output"; +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl index 6e65fe3e01..fe8207f77f 100644 --- a/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl +++ b/src/lib/libcrypto/sha/asm/sha1-armv4-large.pl @@ -47,6 +47,10 @@ # Cortex A8 core and in absolute terms ~870 cycles per input block # [or 13.6 cycles per byte]. +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 10% +# improvement on Cortex A8 core and 12.2 cycles per byte. while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; @@ -76,31 +80,41 @@ $code.=<<___; add $e,$K,$e,ror#2 @ E+=K_xx_xx ldr $t3,[$Xi,#2*4] eor $t0,$t0,$t1 - eor $t2,$t2,$t3 + eor $t2,$t2,$t3 @ 1 cycle stall eor $t1,$c,$d @ F_xx_xx mov $t0,$t0,ror#31 add $e,$e,$a,ror#27 @ E+=ROR(A,27) eor $t0,$t0,$t2,ror#31 + str $t0,[$Xi,#-4]! $opt1 @ F_xx_xx $opt2 @ F_xx_xx add $e,$e,$t0 @ E+=X[i] - str $t0,[$Xi,#-4]! ___ } sub BODY_00_15 { my ($a,$b,$c,$d,$e)=@_; $code.=<<___; - ldrb $t0,[$inp],#4 - ldrb $t1,[$inp,#-1] - ldrb $t2,[$inp,#-2] +#if __ARM_ARCH__<7 + ldrb $t1,[$inp,#2] + ldrb $t0,[$inp,#3] + ldrb $t2,[$inp,#1] add $e,$K,$e,ror#2 @ E+=K_00_19 - ldrb $t3,[$inp,#-3] + ldrb $t3,[$inp],#4 + orr $t0,$t0,$t1,lsl#8 + eor $t1,$c,$d @ F_xx_xx + orr $t0,$t0,$t2,lsl#16 add $e,$e,$a,ror#27 @ E+=ROR(A,27) - orr $t0,$t1,$t0,lsl#24 + orr $t0,$t0,$t3,lsl#24 +#else + ldr $t0,[$inp],#4 @ handles unaligned + add $e,$K,$e,ror#2 @ E+=K_00_19 eor $t1,$c,$d @ F_xx_xx - orr $t0,$t0,$t2,lsl#8 - orr $t0,$t0,$t3,lsl#16 + add $e,$e,$a,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev $t0,$t0 @ byte swap +#endif +#endif and $t1,$b,$t1,ror#2 add $e,$e,$t0 @ E+=X[i] eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D) @@ -136,6 +150,8 @@ ___ } $code=<<___; +#include "arm_arch.h" + .text .global sha1_block_data_order @@ -209,10 +225,14 @@ $code.=<<___; teq $inp,$len bne .Lloop @ [+18], total 1307 +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else ldmia sp!,{r4-r12,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) +#endif .align 2 .LK_00_19: .word 0x5a827999 .LK_20_39: .word 0x6ed9eba1 diff --git a/src/lib/libcrypto/sha/asm/sha1-ia64.pl b/src/lib/libcrypto/sha/asm/sha1-ia64.pl index 51c4f47ecb..db28f0805a 100644 --- a/src/lib/libcrypto/sha/asm/sha1-ia64.pl +++ b/src/lib/libcrypto/sha/asm/sha1-ia64.pl @@ -15,7 +15,7 @@ # is >50% better than HP C and >2x better than gcc. $code=<<___; -.ident \"sha1-ia64.s, version 1.2\" +.ident \"sha1-ia64.s, version 1.3\" .ident \"IA-64 ISA artwork by Andy Polyakov \" .explicit @@ -26,14 +26,10 @@ if ($^O eq "hpux") { $ADDP="addp4"; for (@ARGV) { $ADDP="add" if (/[\+DD|\-mlp]64/); } } else { $ADDP="add"; } -for (@ARGV) { $big_endian=1 if (/\-DB_ENDIAN/); - $big_endian=0 if (/\-DL_ENDIAN/); } -if (!defined($big_endian)) - { $big_endian=(unpack('L',pack('N',1))==1); } #$human=1; if ($human) { # useful for visual code auditing... - ($A,$B,$C,$D,$E,$T) = ("A","B","C","D","E","T"); + ($A,$B,$C,$D,$E) = ("A","B","C","D","E"); ($h0,$h1,$h2,$h3,$h4) = ("h0","h1","h2","h3","h4"); ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = ( "K_00_19","K_20_39","K_40_59","K_60_79" ); @@ -41,47 +37,50 @@ if ($human) { # useful for visual code auditing... "X8", "X9","X10","X11","X12","X13","X14","X15" ); } else { - ($A,$B,$C,$D,$E,$T) = ("loc0","loc1","loc2","loc3","loc4","loc5"); - ($h0,$h1,$h2,$h3,$h4) = ("loc6","loc7","loc8","loc9","loc10"); + ($A,$B,$C,$D,$E) = ("loc0","loc1","loc2","loc3","loc4"); + ($h0,$h1,$h2,$h3,$h4) = ("loc5","loc6","loc7","loc8","loc9"); ($K_00_19, $K_20_39, $K_40_59, $K_60_79) = - ( "r14", "r15", "loc11", "loc12" ); + ( "r14", "r15", "loc10", "loc11" ); @X= ( "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31" ); } sub BODY_00_15 { local *code=shift; -local ($i,$a,$b,$c,$d,$e,$f)=@_; +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +my $Xn=@X[$j%16]; $code.=<<___ if ($i==0); -{ .mmi; ld1 $X[$i&0xf]=[inp],2 // MSB +{ .mmi; ld1 $X[$i]=[inp],2 // MSB ld1 tmp2=[tmp3],2 };; { .mmi; ld1 tmp0=[inp],2 ld1 tmp4=[tmp3],2 // LSB - dep $X[$i&0xf]=$X[$i&0xf],tmp2,8,8 };; + dep $X[$i]=$X[$i],tmp2,8,8 };; ___ if ($i<15) { $code.=<<___; -{ .mmi; ld1 $X[($i+1)&0xf]=[inp],2 // +1 +{ .mmi; ld1 $Xn=[inp],2 // forward Xload + nop.m 0x0 dep tmp1=tmp0,tmp4,8,8 };; -{ .mmi; ld1 tmp2=[tmp3],2 // +1 +{ .mmi; ld1 tmp2=[tmp3],2 // forward Xload and tmp4=$c,$b - dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;; -{ .mmi; andcm tmp1=$d,$b - add tmp0=$e,$K_00_19 + dep $X[$i]=$X[$i],tmp1,16,16} //;; +{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19 + andcm tmp1=$d,$b dep.z tmp5=$a,5,27 };; // a<<5 -{ .mmi; or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) - add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19 +{ .mmi; add $e=$e,$X[$i] // e+=Xload + or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) extr.u tmp1=$a,27,5 };; // a>>27 -{ .mmi; ld1 tmp0=[inp],2 // +1 - add $f=$f,tmp4 // f+=F_00_19(b,c,d) +{ .mmi; ld1 tmp0=[inp],2 // forward Xload + add $e=$e,tmp4 // e+=F_00_19(b,c,d) shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) -{ .mmi; ld1 tmp4=[tmp3],2 // +1 +{ .mmi; ld1 tmp4=[tmp3],2 // forward Xload or tmp5=tmp1,tmp5 // ROTATE(a,5) mux2 tmp6=$a,0x44 };; // see b in next iteration -{ .mii; add $f=$f,tmp5 // f+=ROTATE(a,5) - dep $X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8 // +1 - mux2 $X[$i&0xf]=$X[$i&0xf],0x44 } //;; +{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5) + dep $Xn=$Xn,tmp2,8,8 // forward Xload + mux2 $X[$i]=$X[$i],0x44 } //;; ___ } @@ -89,24 +88,24 @@ else { $code.=<<___; { .mii; and tmp3=$c,$b dep tmp1=tmp0,tmp4,8,8;; - dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;; -{ .mmi; andcm tmp1=$d,$b - add tmp0=$e,$K_00_19 + dep $X[$i]=$X[$i],tmp1,16,16} //;; +{ .mmi; add $e=$e,$K_00_19 // e+=K_00_19 + andcm tmp1=$d,$b dep.z tmp5=$a,5,27 };; // a<<5 -{ .mmi; or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) - add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19 +{ .mmi; add $e=$e,$X[$i] // e+=Xupdate + or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) extr.u tmp1=$a,27,5 } // a>>27 -{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 - xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 +{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate + xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate nop.i 0 };; -{ .mmi; add $f=$f,tmp4 // f+=F_00_19(b,c,d) - xor tmp2=tmp2,tmp3 // +1 +{ .mmi; add $e=$e,tmp4 // e+=F_00_19(b,c,d) + xor $Xn=$Xn,tmp3 // forward Xupdate shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) mux2 tmp6=$a,0x44 };; // see b in next iteration -{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) - shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) - mux2 $X[$i&0xf]=$X[$i&0xf],0x44 };; +{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) + shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) + mux2 $X[$i]=$X[$i],0x44 };; ___ } @@ -114,27 +113,28 @@ ___ sub BODY_16_19 { local *code=shift; -local ($i,$a,$b,$c,$d,$e,$f)=@_; +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +my $Xn=@X[$j%16]; $code.=<<___; -{ .mmi; mov $X[$i&0xf]=$f // Xupdate - and tmp0=$c,$b +{ .mib; add $e=$e,$K_00_19 // e+=K_00_19 dep.z tmp5=$a,5,27 } // a<<5 -{ .mmi; andcm tmp1=$d,$b - add tmp4=$e,$K_00_19 };; -{ .mmi; or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) - add $f=$f,tmp4 // f+=e+K_00_19 +{ .mib; andcm tmp1=$d,$b + and tmp0=$c,$b };; +{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate + or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) extr.u tmp1=$a,27,5 } // a>>27 -{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 - xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 +{ .mmi; xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate + xor tmp3=$X[($j+8)%16],$X[($j+13)%16] // forward Xupdate nop.i 0 };; -{ .mmi; add $f=$f,tmp0 // f+=F_00_19(b,c,d) - xor tmp2=tmp2,tmp3 // +1 +{ .mmi; add $e=$e,tmp0 // f+=F_00_19(b,c,d) + xor $Xn=$Xn,tmp3 // forward Xupdate shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) mux2 tmp6=$a,0x44 };; // see b in next iteration -{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) - shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) +{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) + shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) nop.i 0 };; ___ @@ -142,49 +142,47 @@ ___ sub BODY_20_39 { local *code=shift; -local ($i,$a,$b,$c,$d,$e,$f,$Konst)=@_; +my ($i,$a,$b,$c,$d,$e,$Konst)=@_; $Konst = $K_20_39 if (!defined($Konst)); +my $j=$i+1; +my $Xn=@X[$j%16]; if ($i<79) { $code.=<<___; -{ .mib; mov $X[$i&0xf]=$f // Xupdate +{ .mib; add $e=$e,$Konst // e+=K_XX_XX dep.z tmp5=$a,5,27 } // a<<5 { .mib; xor tmp0=$c,$b - add tmp4=$e,$Konst };; -{ .mmi; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d - add $f=$f,tmp4 // f+=e+K_20_39 + xor $Xn=$Xn,$X[($j+2)%16] };; // forward Xupdate +{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate extr.u tmp1=$a,27,5 } // a>>27 -{ .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 - xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 - nop.i 0 };; -{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d) - xor tmp2=tmp2,tmp3 // +1 +{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d + xor $Xn=$Xn,$X[($j+8)%16] };; // forward Xupdate +{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d) + xor $Xn=$Xn,$X[($j+13)%16] // forward Xupdate shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) mux2 tmp6=$a,0x44 };; // see b in next iteration -{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) - shrp $e=tmp2,tmp2,31 // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) +{ .mii; add $e=$e,tmp1 // e+=ROTATE(a,5) + shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) nop.i 0 };; ___ } else { $code.=<<___; -{ .mib; mov $X[$i&0xf]=$f // Xupdate +{ .mib; add $e=$e,$Konst // e+=K_60_79 dep.z tmp5=$a,5,27 } // a<<5 { .mib; xor tmp0=$c,$b - add tmp4=$e,$Konst };; -{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d - extr.u tmp1=$a,27,5 } // a>>27 -{ .mib; add $f=$f,tmp4 // f+=e+K_20_39 add $h1=$h1,$a };; // wrap up -{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d) - shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) ;;? -{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) +{ .mib; add $e=$e,$X[$i%16] // e+=Xupdate + extr.u tmp1=$a,27,5 } // a>>27 +{ .mib; xor tmp0=tmp0,$d // F_20_39(b,c,d)=b^c^d add $h3=$h3,$c };; // wrap up -{ .mib; add tmp3=1,inp // used in unaligned codepath - add $f=$f,tmp1 } // f+=ROTATE(a,5) -{ .mib; add $h2=$h2,$b // wrap up +{ .mmi; add $e=$e,tmp0 // e+=F_20_39(b,c,d) + or tmp1=tmp1,tmp5 // ROTATE(a,5) + shrp $b=tmp6,tmp6,2 };; // b=ROTATE(b,30) ;;? +{ .mmi; add $e=$e,tmp1 // e+=ROTATE(a,5) + add tmp3=1,inp // used in unaligned codepath add $h4=$h4,$d };; // wrap up ___ @@ -193,29 +191,29 @@ ___ sub BODY_40_59 { local *code=shift; -local ($i,$a,$b,$c,$d,$e,$f)=@_; +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +my $Xn=@X[$j%16]; $code.=<<___; -{ .mmi; mov $X[$i&0xf]=$f // Xupdate - and tmp0=$c,$b +{ .mib; add $e=$e,$K_40_59 // e+=K_40_59 dep.z tmp5=$a,5,27 } // a<<5 -{ .mmi; and tmp1=$d,$b - add tmp4=$e,$K_40_59 };; -{ .mmi; or tmp0=tmp0,tmp1 // (b&c)|(b&d) - add $f=$f,tmp4 // f+=e+K_40_59 +{ .mib; and tmp1=$c,$d + xor tmp0=$c,$d };; +{ .mmi; add $e=$e,$X[$i%16] // e+=Xupdate + add tmp5=tmp5,tmp1 // a<<5+(c&d) extr.u tmp1=$a,27,5 } // a>>27 -{ .mmi; and tmp4=$c,$d - xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 - xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 - };; -{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) - xor tmp2=tmp2,tmp3 // +1 +{ .mmi; and tmp0=tmp0,$b + xor $Xn=$Xn,$X[($j+2)%16] // forward Xupdate + xor tmp3=$X[($j+8)%16],$X[($j+13)%16] };; // forward Xupdate +{ .mmi; add $e=$e,tmp0 // e+=b&(c^d) + add tmp5=tmp5,tmp1 // ROTATE(a,5)+(c&d) shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) -{ .mmi; or tmp0=tmp0,tmp4 // F_40_59(b,c,d)=(b&c)|(b&d)|(c&d) +{ .mmi; xor $Xn=$Xn,tmp3 mux2 tmp6=$a,0x44 };; // see b in next iteration -{ .mii; add $f=$f,tmp0 // f+=F_40_59(b,c,d) - shrp $e=tmp2,tmp2,31;; // f+1=ROTATE(x[0]^x[2]^x[8]^x[13],1) - add $f=$f,tmp1 };; // f+=ROTATE(a,5) +{ .mii; add $e=$e,tmp5 // e+=ROTATE(a,5)+(c&d) + shrp $Xn=$Xn,$Xn,31 // ROTATE(x[0]^x[2]^x[8]^x[13],1) + nop.i 0x0 };; ___ } @@ -237,7 +235,7 @@ inp=r33; // in1 .align 32 sha1_block_data_order: .prologue -{ .mmi; alloc tmp1=ar.pfs,3,15,0,0 +{ .mmi; alloc tmp1=ar.pfs,3,14,0,0 $ADDP tmp0=4,ctx .save ar.lc,r3 mov r3=ar.lc } @@ -245,8 +243,8 @@ sha1_block_data_order: $ADDP inp=0,inp mov r2=pr };; tmp4=in2; -tmp5=loc13; -tmp6=loc14; +tmp5=loc12; +tmp6=loc13; .body { .mlx; ld4 $h0=[ctx],8 movl $K_00_19=0x5a827999 } @@ -273,7 +271,7 @@ tmp6=loc14; ___ -{ my $i,@V=($A,$B,$C,$D,$E,$T); +{ my $i,@V=($A,$B,$C,$D,$E); for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); } for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); } @@ -281,12 +279,12 @@ ___ for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); } for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); } - (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check + (($V[0] eq $A) and ($V[4] eq $E)) or die; # double-check } $code.=<<___; -{ .mmb; add $h0=$h0,$E - nop.m 0 +{ .mmb; add $h0=$h0,$A + add $h2=$h2,$C br.ctop.dptk.many .Ldtop };; .Ldend: { .mmi; add tmp0=4,ctx diff --git a/src/lib/libcrypto/sha/asm/sha1-mips.pl b/src/lib/libcrypto/sha/asm/sha1-mips.pl new file mode 100644 index 0000000000..f1a702f38f --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha1-mips.pl @@ -0,0 +1,354 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA1 block procedure for MIPS. + +# Performance improvement is 30% on unaligned input. The "secret" is +# to deploy lwl/lwr pair to load unaligned input. One could have +# vectorized Xupdate on MIPSIII/IV, but the goal was to code MIPS32- +# compatible subroutine. There is room for minor optimization on +# little-endian platforms... + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 + +if ($flavour =~ /64|n32/i) { + $PTR_ADD="dadd"; # incidentally works even on n32 + $PTR_SUB="dsub"; # incidentally works even on n32 + $REG_S="sd"; + $REG_L="ld"; + $PTR_SLL="dsll"; # incidentally works even on n32 + $SZREG=8; +} else { + $PTR_ADD="add"; + $PTR_SUB="sub"; + $REG_S="sw"; + $REG_L="lw"; + $PTR_SLL="sll"; + $SZREG=4; +} +# +# +# +###################################################################### + +$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; + +for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } +open STDOUT,">$output"; + +if (!defined($big_endian)) + { $big_endian=(unpack('L',pack('N',1))==1); } + +# offsets of the Most and Least Significant Bytes +$MSB=$big_endian?0:3; +$LSB=3&~$MSB; + +@X=map("\$$_",(8..23)); # a4-a7,s0-s11 + +$ctx=$a0; +$inp=$a1; +$num=$a2; +$A="\$1"; +$B="\$2"; +$C="\$3"; +$D="\$7"; +$E="\$24"; @V=($A,$B,$C,$D,$E); +$t0="\$25"; +$t1=$num; # $num is offloaded to stack +$t2="\$30"; # fp +$K="\$31"; # ra + +sub BODY_00_14 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if (!$big_endian); + srl $t0,@X[$i],24 # byte swap($i) + srl $t1,@X[$i],8 + andi $t2,@X[$i],0xFF00 + sll @X[$i],@X[$i],24 + andi $t1,0xFF00 + sll $t2,$t2,8 + or @X[$i],$t0 + or $t1,$t2 + or @X[$i],$t1 +___ +$code.=<<___; + lwl @X[$j],$j*4+$MSB($inp) + sll $t0,$a,5 # $i + addu $e,$K + lwr @X[$j],$j*4+$LSB($inp) + srl $t1,$a,27 + addu $e,$t0 + xor $t0,$c,$d + addu $e,$t1 + sll $t2,$b,30 + and $t0,$b + srl $b,$b,2 + xor $t0,$d + addu $e,@X[$i] + or $b,$t2 + addu $e,$t0 +___ +} + +sub BODY_15_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; + +$code.=<<___ if (!$big_endian && $i==15); + srl $t0,@X[$i],24 # byte swap($i) + srl $t1,@X[$i],8 + andi $t2,@X[$i],0xFF00 + sll @X[$i],@X[$i],24 + andi $t1,0xFF00 + sll $t2,$t2,8 + or @X[$i],$t0 + or @X[$i],$t1 + or @X[$i],$t2 +___ +$code.=<<___; + xor @X[$j%16],@X[($j+2)%16] + sll $t0,$a,5 # $i + addu $e,$K + srl $t1,$a,27 + addu $e,$t0 + xor @X[$j%16],@X[($j+8)%16] + xor $t0,$c,$d + addu $e,$t1 + xor @X[$j%16],@X[($j+13)%16] + sll $t2,$b,30 + and $t0,$b + srl $t1,@X[$j%16],31 + addu @X[$j%16],@X[$j%16] + srl $b,$b,2 + xor $t0,$d + or @X[$j%16],$t1 + addu $e,@X[$i%16] + or $b,$t2 + addu $e,$t0 +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); + xor @X[$j%16],@X[($j+2)%16] + sll $t0,$a,5 # $i + addu $e,$K + srl $t1,$a,27 + addu $e,$t0 + xor @X[$j%16],@X[($j+8)%16] + xor $t0,$c,$d + addu $e,$t1 + xor @X[$j%16],@X[($j+13)%16] + sll $t2,$b,30 + xor $t0,$b + srl $t1,@X[$j%16],31 + addu @X[$j%16],@X[$j%16] + srl $b,$b,2 + addu $e,@X[$i%16] + or @X[$j%16],$t1 + or $b,$t2 + addu $e,$t0 +___ +$code.=<<___ if ($i==79); + lw @X[0],0($ctx) + sll $t0,$a,5 # $i + addu $e,$K + lw @X[1],4($ctx) + srl $t1,$a,27 + addu $e,$t0 + lw @X[2],8($ctx) + xor $t0,$c,$d + addu $e,$t1 + lw @X[3],12($ctx) + sll $t2,$b,30 + xor $t0,$b + lw @X[4],16($ctx) + srl $b,$b,2 + addu $e,@X[$i%16] + or $b,$t2 + addu $e,$t0 +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); + xor @X[$j%16],@X[($j+2)%16] + sll $t0,$a,5 # $i + addu $e,$K + srl $t1,$a,27 + addu $e,$t0 + xor @X[$j%16],@X[($j+8)%16] + and $t0,$c,$d + addu $e,$t1 + xor @X[$j%16],@X[($j+13)%16] + sll $t2,$b,30 + addu $e,$t0 + srl $t1,@X[$j%16],31 + xor $t0,$c,$d + addu @X[$j%16],@X[$j%16] + and $t0,$b + srl $b,$b,2 + or @X[$j%16],$t1 + addu $e,@X[$i%16] + or $b,$t2 + addu $e,$t0 +___ +} + +$FRAMESIZE=16; # large enough to accomodate NUBI saved registers +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; + +$code=<<___; +#ifdef OPENSSL_FIPSCANISTER +# include +#endif + +.text + +.set noat +.set noreorder +.align 5 +.globl sha1_block_data_order +.ent sha1_block_data_order +sha1_block_data_order: + .frame $sp,$FRAMESIZE*$SZREG,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder + $PTR_SUB $sp,$FRAMESIZE*$SZREG + $REG_S $ra,($FRAMESIZE-1)*$SZREG($sp) + $REG_S $fp,($FRAMESIZE-2)*$SZREG($sp) + $REG_S $s11,($FRAMESIZE-3)*$SZREG($sp) + $REG_S $s10,($FRAMESIZE-4)*$SZREG($sp) + $REG_S $s9,($FRAMESIZE-5)*$SZREG($sp) + $REG_S $s8,($FRAMESIZE-6)*$SZREG($sp) + $REG_S $s7,($FRAMESIZE-7)*$SZREG($sp) + $REG_S $s6,($FRAMESIZE-8)*$SZREG($sp) + $REG_S $s5,($FRAMESIZE-9)*$SZREG($sp) + $REG_S $s4,($FRAMESIZE-10)*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S $s3,($FRAMESIZE-11)*$SZREG($sp) + $REG_S $s2,($FRAMESIZE-12)*$SZREG($sp) + $REG_S $s1,($FRAMESIZE-13)*$SZREG($sp) + $REG_S $s0,($FRAMESIZE-14)*$SZREG($sp) + $REG_S $gp,($FRAMESIZE-15)*$SZREG($sp) +___ +$code.=<<___; + $PTR_SLL $num,6 + $PTR_ADD $num,$inp + $REG_S $num,0($sp) + lw $A,0($ctx) + lw $B,4($ctx) + lw $C,8($ctx) + lw $D,12($ctx) + b .Loop + lw $E,16($ctx) +.align 4 +.Loop: + .set reorder + lwl @X[0],$MSB($inp) + lui $K,0x5a82 + lwr @X[0],$LSB($inp) + ori $K,0x7999 # K_00_19 +___ +for ($i=0;$i<15;$i++) { &BODY_00_14($i,@V); unshift(@V,pop(@V)); } +for (;$i<20;$i++) { &BODY_15_19($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + lui $K,0x6ed9 + ori $K,0xeba1 # K_20_39 +___ +for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + lui $K,0x8f1b + ori $K,0xbcdc # K_40_59 +___ +for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + lui $K,0xca62 + ori $K,0xc1d6 # K_60_79 +___ +for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + $PTR_ADD $inp,64 + $REG_L $num,0($sp) + + addu $A,$X[0] + addu $B,$X[1] + sw $A,0($ctx) + addu $C,$X[2] + addu $D,$X[3] + sw $B,4($ctx) + addu $E,$X[4] + sw $C,8($ctx) + sw $D,12($ctx) + sw $E,16($ctx) + .set noreorder + bne $inp,$num,.Loop + nop + + .set noreorder + $REG_L $ra,($FRAMESIZE-1)*$SZREG($sp) + $REG_L $fp,($FRAMESIZE-2)*$SZREG($sp) + $REG_L $s11,($FRAMESIZE-3)*$SZREG($sp) + $REG_L $s10,($FRAMESIZE-4)*$SZREG($sp) + $REG_L $s9,($FRAMESIZE-5)*$SZREG($sp) + $REG_L $s8,($FRAMESIZE-6)*$SZREG($sp) + $REG_L $s7,($FRAMESIZE-7)*$SZREG($sp) + $REG_L $s6,($FRAMESIZE-8)*$SZREG($sp) + $REG_L $s5,($FRAMESIZE-9)*$SZREG($sp) + $REG_L $s4,($FRAMESIZE-10)*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,($FRAMESIZE-11)*$SZREG($sp) + $REG_L $s2,($FRAMESIZE-12)*$SZREG($sp) + $REG_L $s1,($FRAMESIZE-13)*$SZREG($sp) + $REG_L $s0,($FRAMESIZE-14)*$SZREG($sp) + $REG_L $gp,($FRAMESIZE-15)*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE*$SZREG +.end sha1_block_data_order +.rdata +.asciiz "SHA1 for MIPS, CRYPTOGAMS by " +___ +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/sha/asm/sha1-parisc.pl b/src/lib/libcrypto/sha/asm/sha1-parisc.pl new file mode 100644 index 0000000000..6d7bf495b2 --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha1-parisc.pl @@ -0,0 +1,259 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA1 block procedure for PA-RISC. + +# June 2009. +# +# On PA-7100LC performance is >30% better than gcc 3.2 generated code +# for aligned input and >50% better for unaligned. Compared to vendor +# compiler on PA-8600 it's almost 60% faster in 64-bit build and just +# few percent faster in 32-bit one (this for aligned input, data for +# unaligned input is not available). +# +# Special thanks to polarhome.com for providing HP-UX account. + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; +} else { + $LEVEL ="1.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; +} + +$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker + # [+ argument transfer] +$ctx="%r26"; # arg0 +$inp="%r25"; # arg1 +$num="%r24"; # arg2 + +$t0="%r28"; +$t1="%r29"; +$K="%r31"; + +@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", + "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0); + +@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23"); + +sub BODY_00_19 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<15); + addl $K,$e,$e ; $i + shd $a,$a,27,$t1 + addl @X[$i],$e,$e + and $c,$b,$t0 + addl $t1,$e,$e + andcm $d,$b,$t1 + shd $b,$b,2,$b + or $t1,$t0,$t0 + addl $t0,$e,$e +___ +$code.=<<___ if ($i>=15); # with forward Xupdate + addl $K,$e,$e ; $i + shd $a,$a,27,$t1 + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + addl @X[$i%16],$e,$e + and $c,$b,$t0 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + addl $t1,$e,$e + andcm $d,$b,$t1 + shd $b,$b,2,$b + or $t1,$t0,$t0 + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + add $t0,$e,$e + shd @X[$j%16],@X[$j%16],31,@X[$j%16] +___ +} + +sub BODY_20_39 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___ if ($i<79); + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i + addl $K,$e,$e + shd $a,$a,27,$t1 + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + addl @X[$i%16],$e,$e + xor $b,$c,$t0 + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + addl $t1,$e,$e + shd $b,$b,2,$b + xor $d,$t0,$t0 + shd @X[$j%16],@X[$j%16],31,@X[$j%16] + addl $t0,$e,$e +___ +$code.=<<___ if ($i==79); # with context load + ldw 0($ctx),@X[0] ; $i + addl $K,$e,$e + shd $a,$a,27,$t1 + ldw 4($ctx),@X[1] + addl @X[$i%16],$e,$e + xor $b,$c,$t0 + ldw 8($ctx),@X[2] + addl $t1,$e,$e + shd $b,$b,2,$b + xor $d,$t0,$t0 + ldw 12($ctx),@X[3] + addl $t0,$e,$e + ldw 16($ctx),@X[4] +___ +} + +sub BODY_40_59 { +my ($i,$a,$b,$c,$d,$e)=@_; +my $j=$i+1; +$code.=<<___; + shd $a,$a,27,$t1 ; $i + addl $K,$e,$e + xor @X[($j+2)%16],@X[$j%16],@X[$j%16] + xor $d,$c,$t0 + addl @X[$i%16],$e,$e + xor @X[($j+8)%16],@X[$j%16],@X[$j%16] + and $b,$t0,$t0 + addl $t1,$e,$e + shd $b,$b,2,$b + xor @X[($j+13)%16],@X[$j%16],@X[$j%16] + addl $t0,$e,$e + and $d,$c,$t1 + shd @X[$j%16],@X[$j%16],31,@X[$j%16] + addl $t1,$e,$e +___ +} + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR +sha1_block_data_order + .PROC + .CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) + $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) + $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) + $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) + $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) + $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) + + ldw 0($ctx),$A + ldw 4($ctx),$B + ldw 8($ctx),$C + ldw 12($ctx),$D + ldw 16($ctx),$E + + extru $inp,31,2,$t0 ; t0=inp&3; + sh3addl $t0,%r0,$t0 ; t0*=8; + subi 32,$t0,$t0 ; t0=32-t0; + mtctl $t0,%cr11 ; %sar=t0; + +L\$oop + ldi 3,$t0 + andcm $inp,$t0,$t0 ; 64-bit neutral +___ + for ($i=0;$i<15;$i++) { # load input block + $code.="\tldw `4*$i`($t0),@X[$i]\n"; } +$code.=<<___; + cmpb,*= $inp,$t0,L\$aligned + ldw 60($t0),@X[15] + ldw 64($t0),@X[16] +___ + for ($i=0;$i<16;$i++) { # align input + $code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; } +$code.=<<___; +L\$aligned + ldil L'0x5a827000,$K ; K_00_19 + ldo 0x999($K),$K +___ +for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + ldil L'0x6ed9e000,$K ; K_20_39 + ldo 0xba1($K),$K +___ + +for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + ldil L'0x8f1bb000,$K ; K_40_59 + ldo 0xcdc($K),$K +___ + +for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + ldil L'0xca62c000,$K ; K_60_79 + ldo 0x1d6($K),$K +___ +for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } + +$code.=<<___; + addl @X[0],$A,$A + addl @X[1],$B,$B + addl @X[2],$C,$C + addl @X[3],$D,$D + addl @X[4],$E,$E + stw $A,0($ctx) + stw $B,4($ctx) + stw $C,8($ctx) + stw $D,12($ctx) + stw $E,16($ctx) + addib,*<> -1,$num,L\$oop + ldo 64($inp),$inp + + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 + $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 + $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 + $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 + $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 + $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + .STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by " +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +$code =~ s/,\*/,/gm if ($SIZE_T==4); +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/sha/asm/sha1-ppc.pl b/src/lib/libcrypto/sha/asm/sha1-ppc.pl index dcd0fcdfcf..2140dd2f8d 100755 --- a/src/lib/libcrypto/sha/asm/sha1-ppc.pl +++ b/src/lib/libcrypto/sha/asm/sha1-ppc.pl @@ -24,12 +24,14 @@ $flavour = shift; if ($flavour =~ /64/) { $SIZE_T =8; + $LRSAVE =2*$SIZE_T; $UCMP ="cmpld"; $STU ="stdu"; $POP ="ld"; $PUSH ="std"; } elsif ($flavour =~ /32/) { $SIZE_T =4; + $LRSAVE =$SIZE_T; $UCMP ="cmplw"; $STU ="stwu"; $POP ="lwz"; @@ -43,7 +45,8 @@ die "can't locate ppc-xlate.pl"; open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!"; -$FRAME=24*$SIZE_T; +$FRAME=24*$SIZE_T+64; +$LOCALS=6*$SIZE_T; $K ="r0"; $sp ="r1"; @@ -162,9 +165,8 @@ $code=<<___; .globl .sha1_block_data_order .align 4 .sha1_block_data_order: + $STU $sp,-$FRAME($sp) mflr r0 - $STU $sp,`-($FRAME+64)`($sp) - $PUSH r0,`$FRAME-$SIZE_T*18`($sp) $PUSH r15,`$FRAME-$SIZE_T*17`($sp) $PUSH r16,`$FRAME-$SIZE_T*16`($sp) $PUSH r17,`$FRAME-$SIZE_T*15`($sp) @@ -182,6 +184,7 @@ $code=<<___; $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) lwz $A,0($ctx) lwz $B,4($ctx) lwz $C,8($ctx) @@ -192,37 +195,14 @@ $code=<<___; Laligned: mtctr $num bl Lsha1_block_private -Ldone: - $POP r0,`$FRAME-$SIZE_T*18`($sp) - $POP r15,`$FRAME-$SIZE_T*17`($sp) - $POP r16,`$FRAME-$SIZE_T*16`($sp) - $POP r17,`$FRAME-$SIZE_T*15`($sp) - $POP r18,`$FRAME-$SIZE_T*14`($sp) - $POP r19,`$FRAME-$SIZE_T*13`($sp) - $POP r20,`$FRAME-$SIZE_T*12`($sp) - $POP r21,`$FRAME-$SIZE_T*11`($sp) - $POP r22,`$FRAME-$SIZE_T*10`($sp) - $POP r23,`$FRAME-$SIZE_T*9`($sp) - $POP r24,`$FRAME-$SIZE_T*8`($sp) - $POP r25,`$FRAME-$SIZE_T*7`($sp) - $POP r26,`$FRAME-$SIZE_T*6`($sp) - $POP r27,`$FRAME-$SIZE_T*5`($sp) - $POP r28,`$FRAME-$SIZE_T*4`($sp) - $POP r29,`$FRAME-$SIZE_T*3`($sp) - $POP r30,`$FRAME-$SIZE_T*2`($sp) - $POP r31,`$FRAME-$SIZE_T*1`($sp) - mtlr r0 - addi $sp,$sp,`$FRAME+64` - blr -___ + b Ldone -# PowerPC specification allows an implementation to be ill-behaved -# upon unaligned access which crosses page boundary. "Better safe -# than sorry" principle makes me treat it specially. But I don't -# look for particular offending word, but rather for 64-byte input -# block which crosses the boundary. Once found that block is aligned -# and hashed separately... -$code.=<<___; +; PowerPC specification allows an implementation to be ill-behaved +; upon unaligned access which crosses page boundary. "Better safe +; than sorry" principle makes me treat it specially. But I don't +; look for particular offending word, but rather for 64-byte input +; block which crosses the boundary. Once found that block is aligned +; and hashed separately... .align 4 Lunaligned: subfic $t1,$inp,4096 @@ -237,7 +217,7 @@ Lunaligned: Lcross_page: li $t1,16 mtctr $t1 - addi r20,$sp,$FRAME ; spot below the frame + addi r20,$sp,$LOCALS ; spot within the frame Lmemcpy: lbz r16,0($inp) lbz r17,1($inp) @@ -251,15 +231,40 @@ Lmemcpy: addi r20,r20,4 bdnz Lmemcpy - $PUSH $inp,`$FRAME-$SIZE_T*19`($sp) + $PUSH $inp,`$FRAME-$SIZE_T*18`($sp) li $t1,1 - addi $inp,$sp,$FRAME + addi $inp,$sp,$LOCALS mtctr $t1 bl Lsha1_block_private - $POP $inp,`$FRAME-$SIZE_T*19`($sp) + $POP $inp,`$FRAME-$SIZE_T*18`($sp) addic. $num,$num,-1 bne- Lunaligned - b Ldone + +Ldone: + $POP r0,`$FRAME+$LRSAVE`($sp) + $POP r15,`$FRAME-$SIZE_T*17`($sp) + $POP r16,`$FRAME-$SIZE_T*16`($sp) + $POP r17,`$FRAME-$SIZE_T*15`($sp) + $POP r18,`$FRAME-$SIZE_T*14`($sp) + $POP r19,`$FRAME-$SIZE_T*13`($sp) + $POP r20,`$FRAME-$SIZE_T*12`($sp) + $POP r21,`$FRAME-$SIZE_T*11`($sp) + $POP r22,`$FRAME-$SIZE_T*10`($sp) + $POP r23,`$FRAME-$SIZE_T*9`($sp) + $POP r24,`$FRAME-$SIZE_T*8`($sp) + $POP r25,`$FRAME-$SIZE_T*7`($sp) + $POP r26,`$FRAME-$SIZE_T*6`($sp) + $POP r27,`$FRAME-$SIZE_T*5`($sp) + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + mtlr r0 + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,18,3,0 + .long 0 ___ # This is private block function, which uses tailored calling @@ -309,6 +314,8 @@ $code.=<<___; addi $inp,$inp,`16*4` bdnz- Lsha1_block_private blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 ___ $code.=<<___; .asciz "SHA1 block transform for PPC, CRYPTOGAMS by " diff --git a/src/lib/libcrypto/sha/asm/sha1-s390x.pl b/src/lib/libcrypto/sha/asm/sha1-s390x.pl index 4b17848287..9193dda45e 100644 --- a/src/lib/libcrypto/sha/asm/sha1-s390x.pl +++ b/src/lib/libcrypto/sha/asm/sha1-s390x.pl @@ -21,9 +21,28 @@ # instructions to favour dual-issue z10 pipeline. On z10 hardware is # "only" ~2.3x faster than software. +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. + $kimdfunc=1; # magic function code for kimd instruction -$output=shift; +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $K_00_39="%r0"; $K=$K_00_39; @@ -42,13 +61,14 @@ $t1="%r11"; @X=("%r12","%r13","%r14"); $sp="%r15"; -$frame=160+16*4; +$stdframe=16*$SIZE_T+4*8; +$frame=$stdframe+16*4; sub Xupdate { my $i=shift; $code.=<<___ if ($i==15); - lg $prefetch,160($sp) ### Xupdate(16) warm-up + lg $prefetch,$stdframe($sp) ### Xupdate(16) warm-up lr $X[0],$X[2] ___ return if ($i&1); # Xupdate is vectorized and executed every 2nd cycle @@ -58,8 +78,8 @@ $code.=<<___ if ($i<16); ___ $code.=<<___ if ($i>=16); xgr $X[0],$prefetch ### Xupdate($i) - lg $prefetch,`160+4*(($i+2)%16)`($sp) - xg $X[0],`160+4*(($i+8)%16)`($sp) + lg $prefetch,`$stdframe+4*(($i+2)%16)`($sp) + xg $X[0],`$stdframe+4*(($i+8)%16)`($sp) xgr $X[0],$prefetch rll $X[0],$X[0],1 rllg $X[1],$X[0],32 @@ -68,7 +88,7 @@ $code.=<<___ if ($i>=16); lr $X[2],$X[1] # feedback ___ $code.=<<___ if ($i<=70); - stg $X[0],`160+4*($i%16)`($sp) + stg $X[0],`$stdframe+4*($i%16)`($sp) ___ unshift(@X,pop(@X)); } @@ -148,9 +168,9 @@ $code.=<<___ if ($kimdfunc); tmhl %r0,0x4000 # check for message-security assist jz .Lsoftware lghi %r0,0 - la %r1,16($sp) + la %r1,`2*$SIZE_T`($sp) .long 0xb93e0002 # kimd %r0,%r2 - lg %r0,16($sp) + lg %r0,`2*$SIZE_T`($sp) tmhh %r0,`0x8000>>$kimdfunc` jz .Lsoftware lghi %r0,$kimdfunc @@ -165,11 +185,11 @@ $code.=<<___ if ($kimdfunc); ___ $code.=<<___; lghi %r1,-$frame - stg $ctx,16($sp) - stmg %r6,%r15,48($sp) + st${g} $ctx,`2*$SIZE_T`($sp) + stm${g} %r6,%r15,`6*$SIZE_T`($sp) lgr %r0,$sp la $sp,0(%r1,$sp) - stg %r0,0($sp) + st${g} %r0,0($sp) larl $t0,Ktable llgf $A,0($ctx) @@ -199,7 +219,7 @@ ___ for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } $code.=<<___; - lg $ctx,`$frame+16`($sp) + l${g} $ctx,`$frame+2*$SIZE_T`($sp) la $inp,64($inp) al $A,0($ctx) al $B,4($ctx) @@ -211,13 +231,13 @@ $code.=<<___; st $C,8($ctx) st $D,12($ctx) st $E,16($ctx) - brct $len,.Lloop + brct${g} $len,.Lloop - lmg %r6,%r15,`$frame+48`($sp) + lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) br %r14 .size sha1_block_data_order,.-sha1_block_data_order .string "SHA1 block transform for s390x, CRYPTOGAMS by " -.comm OPENSSL_s390xcap_P,8,8 +.comm OPENSSL_s390xcap_P,16,8 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl index 4edc5ea9ad..f27c1e3fb0 100755 --- a/src/lib/libcrypto/sha/asm/sha1-x86_64.pl +++ b/src/lib/libcrypto/sha/asm/sha1-x86_64.pl @@ -16,7 +16,7 @@ # There was suggestion to mechanically translate 32-bit code, but I # dismissed it, reasoning that x86_64 offers enough register bank # capacity to fully utilize SHA-1 parallelism. Therefore this fresh -# implementation:-) However! While 64-bit code does performs better +# implementation:-) However! While 64-bit code does perform better # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, # x86_64 does offer larger *addressable* bank, but out-of-order core # reaches for even more registers through dynamic aliasing, and EM64T @@ -29,6 +29,38 @@ # Xeon P4 +65% +0% 9.9 # Core2 +60% +10% 7.0 +# August 2009. +# +# The code was revised to minimize code size and to maximize +# "distance" between instructions producing input to 'lea' +# instruction and the 'lea' instruction itself, which is essential +# for Intel Atom core. + +# October 2010. +# +# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it +# is to offload message schedule denoted by Wt in NIST specification, +# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module +# for background and implementation details. The only difference from +# 32-bit code is that 64-bit code doesn't have to spill @X[] elements +# to free temporary registers. + +# April 2011. +# +# Add AVX code path. See sha1-586.pl for further information. + +###################################################################### +# Current performance is summarized in following table. Numbers are +# CPU clock cycles spent to process single byte (less is better). +# +# x86_64 SSSE3 AVX +# P4 9.8 - +# Opteron 6.6 - +# Core2 6.7 6.1/+10% - +# Atom 11.0 9.7/+13% - +# Westmere 7.1 5.6/+27% - +# Sandy Bridge 7.9 6.3/+25% 5.2/+51% + $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } @@ -40,6 +72,16 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; +$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/ && + $1>=2.19); +$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && + $1>=2.09); +$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./ && + $1>=10); + open STDOUT,"| $^X $xlate $flavour $output"; $ctx="%rdi"; # 1st arg @@ -51,196 +93,994 @@ $ctx="%r8"; $inp="%r9"; $num="%r10"; -$xi="%eax"; -$t0="%ebx"; -$t1="%ecx"; -$A="%edx"; -$B="%esi"; -$C="%edi"; -$D="%ebp"; -$E="%r11d"; -$T="%r12d"; - -@V=($A,$B,$C,$D,$E,$T); +$t0="%eax"; +$t1="%ebx"; +$t2="%ecx"; +@xi=("%edx","%ebp"); +$A="%esi"; +$B="%edi"; +$C="%r11d"; +$D="%r12d"; +$E="%r13d"; -sub PROLOGUE { -my $func=shift; -$code.=<<___; -.globl $func -.type $func,\@function,3 -.align 16 -$func: - push %rbx - push %rbp - push %r12 - mov %rsp,%r11 - mov %rdi,$ctx # reassigned argument - sub \$`8+16*4`,%rsp - mov %rsi,$inp # reassigned argument - and \$-64,%rsp - mov %rdx,$num # reassigned argument - mov %r11,`16*4`(%rsp) -.Lprologue: - - mov 0($ctx),$A - mov 4($ctx),$B - mov 8($ctx),$C - mov 12($ctx),$D - mov 16($ctx),$E -___ -} - -sub EPILOGUE { -my $func=shift; -$code.=<<___; - mov `16*4`(%rsp),%rsi - mov (%rsi),%r12 - mov 8(%rsi),%rbp - mov 16(%rsi),%rbx - lea 24(%rsi),%rsp -.Lepilogue: - ret -.size $func,.-$func -___ -} +@V=($A,$B,$C,$D,$E); sub BODY_00_19 { -my ($i,$a,$b,$c,$d,$e,$f,$host)=@_; +my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___ if ($i==0); - mov `4*$i`($inp),$xi - `"bswap $xi" if(!defined($host))` - mov $xi,`4*$i`(%rsp) + mov `4*$i`($inp),$xi[0] + bswap $xi[0] + mov $xi[0],`4*$i`(%rsp) ___ $code.=<<___ if ($i<15); - lea 0x5a827999($xi,$e),$f mov $c,$t0 - mov `4*$j`($inp),$xi - mov $a,$e + mov `4*$j`($inp),$xi[1] + mov $a,$t2 xor $d,$t0 - `"bswap $xi" if(!defined($host))` - rol \$5,$e + bswap $xi[1] + rol \$5,$t2 + lea 0x5a827999($xi[0],$e),$e and $b,$t0 - mov $xi,`4*$j`(%rsp) - add $e,$f + mov $xi[1],`4*$j`(%rsp) + add $t2,$e xor $d,$t0 rol \$30,$b - add $t0,$f + add $t0,$e ___ $code.=<<___ if ($i>=15); - lea 0x5a827999($xi,$e),$f - mov `4*($j%16)`(%rsp),$xi + mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 - mov $a,$e - xor `4*(($j+2)%16)`(%rsp),$xi + mov $a,$t2 + xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $d,$t0 - rol \$5,$e - xor `4*(($j+8)%16)`(%rsp),$xi + rol \$5,$t2 + xor `4*(($j+8)%16)`(%rsp),$xi[1] and $b,$t0 - add $e,$f - xor `4*(($j+13)%16)`(%rsp),$xi + lea 0x5a827999($xi[0],$e),$e + xor `4*(($j+13)%16)`(%rsp),$xi[1] xor $d,$t0 + rol \$1,$xi[1] + add $t2,$e rol \$30,$b - add $t0,$f - rol \$1,$xi - mov $xi,`4*($j%16)`(%rsp) + mov $xi[1],`4*($j%16)`(%rsp) + add $t0,$e ___ +unshift(@xi,pop(@xi)); } sub BODY_20_39 { -my ($i,$a,$b,$c,$d,$e,$f)=@_; +my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; my $K=($i<40)?0x6ed9eba1:0xca62c1d6; $code.=<<___ if ($i<79); - lea $K($xi,$e),$f - mov `4*($j%16)`(%rsp),$xi + mov `4*($j%16)`(%rsp),$xi[1] mov $c,$t0 - mov $a,$e - xor `4*(($j+2)%16)`(%rsp),$xi + mov $a,$t2 + xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $b,$t0 - rol \$5,$e - xor `4*(($j+8)%16)`(%rsp),$xi + rol \$5,$t2 + lea $K($xi[0],$e),$e + xor `4*(($j+8)%16)`(%rsp),$xi[1] xor $d,$t0 - add $e,$f - xor `4*(($j+13)%16)`(%rsp),$xi + add $t2,$e + xor `4*(($j+13)%16)`(%rsp),$xi[1] rol \$30,$b - add $t0,$f - rol \$1,$xi + add $t0,$e + rol \$1,$xi[1] ___ $code.=<<___ if ($i<76); - mov $xi,`4*($j%16)`(%rsp) + mov $xi[1],`4*($j%16)`(%rsp) ___ $code.=<<___ if ($i==79); - lea $K($xi,$e),$f mov $c,$t0 - mov $a,$e + mov $a,$t2 xor $b,$t0 - rol \$5,$e + lea $K($xi[0],$e),$e + rol \$5,$t2 xor $d,$t0 - add $e,$f + add $t2,$e rol \$30,$b - add $t0,$f + add $t0,$e ___ +unshift(@xi,pop(@xi)); } sub BODY_40_59 { -my ($i,$a,$b,$c,$d,$e,$f)=@_; +my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___; - lea 0x8f1bbcdc($xi,$e),$f - mov `4*($j%16)`(%rsp),$xi - mov $b,$t0 - mov $b,$t1 - xor `4*(($j+2)%16)`(%rsp),$xi - mov $a,$e - and $c,$t0 - xor `4*(($j+8)%16)`(%rsp),$xi - or $c,$t1 - rol \$5,$e - xor `4*(($j+13)%16)`(%rsp),$xi - and $d,$t1 - add $e,$f - rol \$1,$xi - or $t1,$t0 + mov `4*($j%16)`(%rsp),$xi[1] + mov $c,$t0 + mov $c,$t1 + xor `4*(($j+2)%16)`(%rsp),$xi[1] + and $d,$t0 + mov $a,$t2 + xor `4*(($j+8)%16)`(%rsp),$xi[1] + xor $d,$t1 + lea 0x8f1bbcdc($xi[0],$e),$e + rol \$5,$t2 + xor `4*(($j+13)%16)`(%rsp),$xi[1] + add $t0,$e + and $b,$t1 + rol \$1,$xi[1] + add $t1,$e rol \$30,$b - mov $xi,`4*($j%16)`(%rsp) - add $t0,$f + mov $xi[1],`4*($j%16)`(%rsp) + add $t2,$e ___ +unshift(@xi,pop(@xi)); } -$code=".text\n"; +$code.=<<___; +.text +.extern OPENSSL_ia32cap_P -&PROLOGUE("sha1_block_data_order"); -$code.=".align 4\n.Lloop:\n"; +.globl sha1_block_data_order +.type sha1_block_data_order,\@function,3 +.align 16 +sha1_block_data_order: + mov OPENSSL_ia32cap_P+0(%rip),%r9d + mov OPENSSL_ia32cap_P+4(%rip),%r8d + test \$`1<<9`,%r8d # check SSSE3 bit + jz .Lialu +___ +$code.=<<___ if ($avx); + and \$`1<<28`,%r8d # mask AVX bit + and \$`1<<30`,%r9d # mask "Intel CPU" bit + or %r9d,%r8d + cmp \$`1<<28|1<<30`,%r8d + je _avx_shortcut +___ +$code.=<<___; + jmp _ssse3_shortcut + +.align 16 +.Lialu: + push %rbx + push %rbp + push %r12 + push %r13 + mov %rsp,%r11 + mov %rdi,$ctx # reassigned argument + sub \$`8+16*4`,%rsp + mov %rsi,$inp # reassigned argument + and \$-64,%rsp + mov %rdx,$num # reassigned argument + mov %r11,`16*4`(%rsp) +.Lprologue: + + mov 0($ctx),$A + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov 16($ctx),$E + jmp .Lloop + +.align 16 +.Lloop: +___ for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } $code.=<<___; - add 0($ctx),$E - add 4($ctx),$T - add 8($ctx),$A - add 12($ctx),$B - add 16($ctx),$C - mov $E,0($ctx) - mov $T,4($ctx) - mov $A,8($ctx) - mov $B,12($ctx) - mov $C,16($ctx) - - xchg $E,$A # mov $E,$A - xchg $T,$B # mov $T,$B - xchg $E,$C # mov $A,$C - xchg $T,$D # mov $B,$D - # mov $C,$E - lea `16*4`($inp),$inp + add 0($ctx),$A + add 4($ctx),$B + add 8($ctx),$C + add 12($ctx),$D + add 16($ctx),$E + mov $A,0($ctx) + mov $B,4($ctx) + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + sub \$1,$num + lea `16*4`($inp),$inp jnz .Lloop + + mov `16*4`(%rsp),%rsi + mov (%rsi),%r13 + mov 8(%rsi),%r12 + mov 16(%rsi),%rbp + mov 24(%rsi),%rbx + lea 32(%rsi),%rsp +.Lepilogue: + ret +.size sha1_block_data_order,.-sha1_block_data_order ___ -&EPILOGUE("sha1_block_data_order"); +{{{ +my $Xi=4; +my @X=map("%xmm$_",(4..7,0..3)); +my @Tx=map("%xmm$_",(8..10)); +my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization +my @T=("%esi","%edi"); +my $j=0; +my $K_XX_XX="%r11"; + +my $_rol=sub { &rol(@_) }; +my $_ror=sub { &ror(@_) }; + +$code.=<<___; +.type sha1_block_data_order_ssse3,\@function,3 +.align 16 +sha1_block_data_order_ssse3: +_ssse3_shortcut: + push %rbx + push %rbp + push %r12 + lea `-64-($win64?5*16:0)`(%rsp),%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,64+0(%rsp) + movaps %xmm7,64+16(%rsp) + movaps %xmm8,64+32(%rsp) + movaps %xmm9,64+48(%rsp) + movaps %xmm10,64+64(%rsp) +.Lprologue_ssse3: +___ +$code.=<<___; + mov %rdi,$ctx # reassigned argument + mov %rsi,$inp # reassigned argument + mov %rdx,$num # reassigned argument + + shl \$6,$num + add $inp,$num + lea K_XX_XX(%rip),$K_XX_XX + + mov 0($ctx),$A # load context + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov $B,@T[0] # magic seed + mov 16($ctx),$E + + movdqa 64($K_XX_XX),@X[2] # pbswap mask + movdqa 0($K_XX_XX),@Tx[1] # K_00_19 + movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] + movdqu 16($inp),@X[-3&7] + movdqu 32($inp),@X[-2&7] + movdqu 48($inp),@X[-1&7] + pshufb @X[2],@X[-4&7] # byte swap + add \$64,$inp + pshufb @X[2],@X[-3&7] + pshufb @X[2],@X[-2&7] + pshufb @X[2],@X[-1&7] + paddd @Tx[1],@X[-4&7] # add K_00_19 + paddd @Tx[1],@X[-3&7] + paddd @Tx[1],@X[-2&7] + movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU + psubd @Tx[1],@X[-4&7] # restore X[] + movdqa @X[-3&7],16(%rsp) + psubd @Tx[1],@X[-3&7] + movdqa @X[-2&7],32(%rsp) + psubd @Tx[1],@X[-2&7] + jmp .Loop_ssse3 +___ + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 40 instructions + my ($a,$b,$c,$d,$e); + + &movdqa (@X[0],@X[-3&7]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (@Tx[0],@X[-1&7]); + &palignr(@X[0],@X[-4&7],8); # compose "X[-14]" in "X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + &psrldq (@Tx[0],4); # "X[-3]", 3 dwords + eval(shift(@insns)); + eval(shift(@insns)); + &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (@Tx[2],@X[0]); + &movdqa (@Tx[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword + &paddd (@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &psrld (@Tx[0],31); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (@Tx[1],@Tx[2]); + eval(shift(@insns)); + eval(shift(@insns)); + + &psrld (@Tx[2],30); + &por (@X[0],@Tx[0]); # "X[0]"<<<=1 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &pslld (@Tx[1],2); + &pxor (@X[0],@Tx[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX + eval(shift(@insns)); + eval(shift(@insns)); + + &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xupdate_ssse3_32_79() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my ($a,$b,$c,$d,$e); + + &movdqa (@Tx[0],@X[-1&7]) if ($Xi==8); + eval(shift(@insns)); # body_20_39 + &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + &palignr(@Tx[0],@X[-2&7],8); # compose "X[-6]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + eval(shift(@insns)); + eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); + if ($Xi%5) { + &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... + } else { # ... or load next one + &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); + } + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &movdqa (@Tx[0],@X[0]); + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &pslld (@X[0],2); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &psrld (@Tx[0],30); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &por (@X[0],@Tx[0]); # "X[0]"<<<=2 + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &movdqa (@Tx[1],@X[0]) if ($Xi<19); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xuplast_ssse3_80() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + &paddd (@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU + + foreach (@insns) { eval; } # remaining instructions + + &cmp ($inp,$num); + &je (".Ldone_ssse3"); + + unshift(@Tx,pop(@Tx)); + + &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask + &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 + &movdqu (@X[-4&7],"0($inp)"); # load input + &movdqu (@X[-3&7],"16($inp)"); + &movdqu (@X[-2&7],"32($inp)"); + &movdqu (@X[-1&7],"48($inp)"); + &pshufb (@X[-4&7],@X[2]); # byte swap + &add ($inp,64); + + $Xi=0; +} + +sub Xloop_ssse3() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &pshufb (@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &paddd (@X[($Xi-4)&7],@Tx[1]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + &psubd (@X[($Xi-4)&7],@Tx[1]); + + foreach (@insns) { eval; } + $Xi++; +} + +sub Xtail_ssse3() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + foreach (@insns) { eval; } +} + +sub body_00_19 () { + ( + '($a,$b,$c,$d,$e)=@V;'. + '&add ($e,eval(4*($j&15))."(%rsp)");', # X[]+K xfer + '&xor ($c,$d);', + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&and (@T[0],$c);', # ($b&($c^$d)) + '&xor ($c,$d);', # restore $c + '&xor (@T[0],$d);', + '&add ($e,$a);', + '&$_ror ($b,$j?7:2);', # $b>>>2 + '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +sub body_20_39 () { + ( + '($a,$b,$c,$d,$e)=@V;'. + '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer + '&xor (@T[0],$d);', # ($b^$d) + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&xor (@T[0],$c);', # ($b^$d^$c) + '&add ($e,$a);', + '&$_ror ($b,7);', # $b>>>2 + '&add ($e,@T[0]);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} + +sub body_40_59 () { + ( + '($a,$b,$c,$d,$e)=@V;'. + '&mov (@T[1],$c);', + '&xor ($c,$d);', + '&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer + '&and (@T[1],$d);', + '&and (@T[0],$c);', # ($b&($c^$d)) + '&$_ror ($b,7);', # $b>>>2 + '&add ($e,@T[1]);', + '&mov (@T[1],$a);', # $b in next round + '&$_rol ($a,5);', + '&add ($e,@T[0]);', + '&xor ($c,$d);', # restore $c + '&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));' + ); +} $code.=<<___; -.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by " .align 16 +.Loop_ssse3: +___ + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_16_31(\&body_00_19); + &Xupdate_ssse3_32_79(\&body_00_19); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_40_59); + &Xupdate_ssse3_32_79(\&body_20_39); + &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + + &Xloop_ssse3(\&body_20_39); + &Xloop_ssse3(\&body_20_39); + &Xloop_ssse3(\&body_20_39); + +$code.=<<___; + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + add 12($ctx),$D + mov $A,0($ctx) + add 16($ctx),$E + mov @T[0],4($ctx) + mov @T[0],$B # magic seed + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + jmp .Loop_ssse3 + +.align 16 +.Ldone_ssse3: +___ + $j=$saved_j; @V=@saved_V; + + &Xtail_ssse3(\&body_20_39); + &Xtail_ssse3(\&body_20_39); + &Xtail_ssse3(\&body_20_39); + +$code.=<<___; + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + mov $A,0($ctx) + add 12($ctx),$D + mov @T[0],4($ctx) + add 16($ctx),$E + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) +___ +$code.=<<___ if ($win64); + movaps 64+0(%rsp),%xmm6 + movaps 64+16(%rsp),%xmm7 + movaps 64+32(%rsp),%xmm8 + movaps 64+48(%rsp),%xmm9 + movaps 64+64(%rsp),%xmm10 +___ +$code.=<<___; + lea `64+($win64?5*16:0)`(%rsp),%rsi + mov 0(%rsi),%r12 + mov 8(%rsi),%rbp + mov 16(%rsi),%rbx + lea 24(%rsi),%rsp +.Lepilogue_ssse3: + ret +.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 +___ + +if ($avx) { +my $Xi=4; +my @X=map("%xmm$_",(4..7,0..3)); +my @Tx=map("%xmm$_",(8..10)); +my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization +my @T=("%esi","%edi"); +my $j=0; +my $K_XX_XX="%r11"; + +my $_rol=sub { &shld(@_[0],@_) }; +my $_ror=sub { &shrd(@_[0],@_) }; + +$code.=<<___; +.type sha1_block_data_order_avx,\@function,3 +.align 16 +sha1_block_data_order_avx: +_avx_shortcut: + push %rbx + push %rbp + push %r12 + lea `-64-($win64?5*16:0)`(%rsp),%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,64+0(%rsp) + movaps %xmm7,64+16(%rsp) + movaps %xmm8,64+32(%rsp) + movaps %xmm9,64+48(%rsp) + movaps %xmm10,64+64(%rsp) +.Lprologue_avx: +___ +$code.=<<___; + mov %rdi,$ctx # reassigned argument + mov %rsi,$inp # reassigned argument + mov %rdx,$num # reassigned argument + vzeroall + + shl \$6,$num + add $inp,$num + lea K_XX_XX(%rip),$K_XX_XX + + mov 0($ctx),$A # load context + mov 4($ctx),$B + mov 8($ctx),$C + mov 12($ctx),$D + mov $B,@T[0] # magic seed + mov 16($ctx),$E + + vmovdqa 64($K_XX_XX),@X[2] # pbswap mask + vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 + vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] + vmovdqu 16($inp),@X[-3&7] + vmovdqu 32($inp),@X[-2&7] + vmovdqu 48($inp),@X[-1&7] + vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap + add \$64,$inp + vpshufb @X[2],@X[-3&7],@X[-3&7] + vpshufb @X[2],@X[-2&7],@X[-2&7] + vpshufb @X[2],@X[-1&7],@X[-1&7] + vpaddd @Tx[1],@X[-4&7],@X[0] # add K_00_19 + vpaddd @Tx[1],@X[-3&7],@X[1] + vpaddd @Tx[1],@X[-2&7],@X[2] + vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU + vmovdqa @X[1],16(%rsp) + vmovdqa @X[2],32(%rsp) + jmp .Loop_avx +___ + +sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 40 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords + eval(shift(@insns)); + eval(shift(@insns)); + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@Tx[0],@X[0],31); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword + &vpaddd (@X[0],@X[0],@X[0]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpsrld (@Tx[1],@Tx[2],30); + &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpslld (@Tx[2],@Tx[2],2); + &vpxor (@X[0],@X[0],@Tx[1]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX + eval(shift(@insns)); + eval(shift(@insns)); + + + foreach (@insns) { eval; } # remaining instructions [if any] + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xupdate_avx_32_79() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions + my ($a,$b,$c,$d,$e); + + &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" + &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" + eval(shift(@insns)); + eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); + if ($Xi%5) { + &vmovdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... + } else { # ... or load next one + &vmovdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); + } + &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + + &vpsrld (@Tx[0],@X[0],30); + &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpslld (@X[0],@X[0],2); + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # ror + eval(shift(@insns)); + + &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 + eval(shift(@insns)); # body_20_39 + eval(shift(@insns)); + &vmovdqa (@Tx[1],@X[0]) if ($Xi<19); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); # rol + eval(shift(@insns)); + + foreach (@insns) { eval; } # remaining instructions + + $Xi++; push(@X,shift(@X)); # "rotate" X[] + push(@Tx,shift(@Tx)); +} + +sub Xuplast_avx_80() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + &vpaddd (@Tx[1],@Tx[1],@X[-1&7]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + + &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU + + foreach (@insns) { eval; } # remaining instructions + + &cmp ($inp,$num); + &je (".Ldone_avx"); + + unshift(@Tx,pop(@Tx)); + + &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask + &vmovdqa(@Tx[1],"0($K_XX_XX)"); # K_00_19 + &vmovdqu(@X[-4&7],"0($inp)"); # load input + &vmovdqu(@X[-3&7],"16($inp)"); + &vmovdqu(@X[-2&7],"32($inp)"); + &vmovdqu(@X[-1&7],"48($inp)"); + &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap + &add ($inp,64); + + $Xi=0; +} + +sub Xloop_avx() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + eval(shift(@insns)); + eval(shift(@insns)); + &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); + eval(shift(@insns)); + eval(shift(@insns)); + &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU + eval(shift(@insns)); + eval(shift(@insns)); + + foreach (@insns) { eval; } + $Xi++; +} + +sub Xtail_avx() +{ use integer; + my $body = shift; + my @insns = (&$body,&$body,&$body,&$body); # 32 instructions + my ($a,$b,$c,$d,$e); + + foreach (@insns) { eval; } +} + +$code.=<<___; +.align 16 +.Loop_avx: +___ + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_16_31(\&body_00_19); + &Xupdate_avx_32_79(\&body_00_19); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_20_39); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_40_59); + &Xupdate_avx_32_79(\&body_20_39); + &Xuplast_avx_80(\&body_20_39); # can jump to "done" + + $saved_j=$j; @saved_V=@V; + + &Xloop_avx(\&body_20_39); + &Xloop_avx(\&body_20_39); + &Xloop_avx(\&body_20_39); + +$code.=<<___; + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + add 12($ctx),$D + mov $A,0($ctx) + add 16($ctx),$E + mov @T[0],4($ctx) + mov @T[0],$B # magic seed + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) + jmp .Loop_avx + +.align 16 +.Ldone_avx: +___ + $j=$saved_j; @V=@saved_V; + + &Xtail_avx(\&body_20_39); + &Xtail_avx(\&body_20_39); + &Xtail_avx(\&body_20_39); + +$code.=<<___; + vzeroall + + add 0($ctx),$A # update context + add 4($ctx),@T[0] + add 8($ctx),$C + mov $A,0($ctx) + add 12($ctx),$D + mov @T[0],4($ctx) + add 16($ctx),$E + mov $C,8($ctx) + mov $D,12($ctx) + mov $E,16($ctx) +___ +$code.=<<___ if ($win64); + movaps 64+0(%rsp),%xmm6 + movaps 64+16(%rsp),%xmm7 + movaps 64+32(%rsp),%xmm8 + movaps 64+48(%rsp),%xmm9 + movaps 64+64(%rsp),%xmm10 +___ +$code.=<<___; + lea `64+($win64?5*16:0)`(%rsp),%rsi + mov 0(%rsi),%r12 + mov 8(%rsi),%rbp + mov 16(%rsi),%rbx + lea 24(%rsi),%rsp +.Lepilogue_avx: + ret +.size sha1_block_data_order_avx,.-sha1_block_data_order_avx +___ +} +$code.=<<___; +.align 64 +K_XX_XX: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask +___ +}}} +$code.=<<___; +.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by " +.align 64 ___ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, @@ -272,25 +1112,75 @@ se_handler: lea .Lprologue(%rip),%r10 cmp %r10,%rbx # context->Rip<.Lprologue - jb .Lin_prologue + jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp lea .Lepilogue(%rip),%r10 cmp %r10,%rbx # context->Rip>=.Lepilogue - jae .Lin_prologue + jae .Lcommon_seh_tail mov `16*4`(%rax),%rax # pull saved stack pointer - lea 24(%rax),%rax + lea 32(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 + mov -32(%rax),%r13 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + + jmp .Lcommon_seh_tail +.size se_handler,.-se_handler -.Lin_prologue: +.type ssse3_handler,\@abi-omnipotent +.align 16 +ssse3_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->RipRsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + lea 64(%rax),%rsi + lea 512($context),%rdi # &context.Xmm6 + mov \$10,%ecx + .long 0xa548f3fc # cld; rep movsq + lea `24+64+5*16`(%rax),%rax # adjust stack pointer + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore cotnext->R12 + +.Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp @@ -328,19 +1218,38 @@ se_handler: pop %rdi pop %rsi ret -.size se_handler,.-se_handler +.size ssse3_handler,.-ssse3_handler .section .pdata .align 4 .rva .LSEH_begin_sha1_block_data_order .rva .LSEH_end_sha1_block_data_order .rva .LSEH_info_sha1_block_data_order - + .rva .LSEH_begin_sha1_block_data_order_ssse3 + .rva .LSEH_end_sha1_block_data_order_ssse3 + .rva .LSEH_info_sha1_block_data_order_ssse3 +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_sha1_block_data_order_avx + .rva .LSEH_end_sha1_block_data_order_avx + .rva .LSEH_info_sha1_block_data_order_avx +___ +$code.=<<___; .section .xdata .align 8 .LSEH_info_sha1_block_data_order: .byte 9,0,0,0 .rva se_handler +.LSEH_info_sha1_block_data_order_ssse3: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] +___ +$code.=<<___ if ($avx); +.LSEH_info_sha1_block_data_order_avx: + .byte 9,0,0,0 + .rva ssse3_handler + .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] ___ } diff --git a/src/lib/libcrypto/sha/asm/sha256-586.pl b/src/lib/libcrypto/sha/asm/sha256-586.pl index ecc8b69c75..928ec53123 100644 --- a/src/lib/libcrypto/sha/asm/sha256-586.pl +++ b/src/lib/libcrypto/sha/asm/sha256-586.pl @@ -14,8 +14,8 @@ # Pentium PIII P4 AMD K8 Core2 # gcc 46 36 41 27 26 # icc 57 33 38 25 23 -# x86 asm 40 30 35 20 20 -# x86_64 asm(*) - - 21 15.8 16.5 +# x86 asm 40 30 33 20 18 +# x86_64 asm(*) - - 21 16 16 # # (*) x86_64 assembler performance is presented for reference # purposes. @@ -48,20 +48,19 @@ sub BODY_00_15() { my $in_16_63=shift; &mov ("ecx",$E); - &add ($T,&DWP(4*(8+15+16-9),"esp")) if ($in_16_63); # T += X[-7] - &ror ("ecx",6); - &mov ("edi",$E); - &ror ("edi",11); + &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2]) + &ror ("ecx",25-11); &mov ("esi",$Foff); - &xor ("ecx","edi"); - &ror ("edi",25-11); + &xor ("ecx",$E); + &ror ("ecx",11-6); &mov (&DWP(4*(8+15),"esp"),$T) if ($in_16_63); # save X[0] - &xor ("ecx","edi"); # Sigma1(e) + &xor ("ecx",$E); + &ror ("ecx",6); # Sigma1(e) &mov ("edi",$Goff); &add ($T,"ecx"); # T += Sigma1(e) - &mov ($Eoff,$E); # modulo-scheduled &xor ("esi","edi"); + &mov ($Eoff,$E); # modulo-scheduled &mov ("ecx",$A); &and ("esi",$E); &mov ($E,$Doff); # e becomes d, which is e in next iteration @@ -69,14 +68,14 @@ sub BODY_00_15() { &mov ("edi",$A); &add ($T,"esi"); # T += Ch(e,f,g) - &ror ("ecx",2); + &ror ("ecx",22-13); &add ($T,$Hoff); # T += h - &ror ("edi",13); + &xor ("ecx",$A); + &ror ("ecx",13-2); &mov ("esi",$Boff); - &xor ("ecx","edi"); - &ror ("edi",22-13); + &xor ("ecx",$A); + &ror ("ecx",2); # Sigma0(a) &add ($E,$T); # d += T - &xor ("ecx","edi"); # Sigma0(a) &mov ("edi",$Coff); &add ($T,"ecx"); # T += Sigma0(a) @@ -168,23 +167,22 @@ sub BODY_00_15() { &set_label("16_63",16); &mov ("esi",$T); &mov ("ecx",&DWP(4*(8+15+16-14),"esp")); - &shr ($T,3); - &ror ("esi",7); - &xor ($T,"esi"); &ror ("esi",18-7); &mov ("edi","ecx"); - &xor ($T,"esi"); # T = sigma0(X[-15]) + &xor ("esi",$T); + &ror ("esi",7); + &shr ($T,3); - &shr ("ecx",10); - &mov ("esi",&DWP(4*(8+15+16),"esp")); - &ror ("edi",17); - &xor ("ecx","edi"); &ror ("edi",19-17); - &add ($T,"esi"); # T += X[-16] - &xor ("edi","ecx") # sigma1(X[-2]) + &xor ($T,"esi"); # T = sigma0(X[-15]) + &xor ("edi","ecx"); + &ror ("edi",17); + &shr ("ecx",10); + &add ($T,&DWP(4*(8+15+16),"esp")); # T += X[-16] + &xor ("edi","ecx"); # sigma1(X[-2]) - &add ($T,"edi"); # T += sigma1(X[-2]) - # &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7], moved to BODY_00_15(1) + &add ($T,&DWP(4*(8+15+16-9),"esp")); # T += X[-7] + # &add ($T,"edi"); # T += sigma1(X[-2]) # &mov (&DWP(4*(8+15),"esp"),$T); # save X[0] &BODY_00_15(1); diff --git a/src/lib/libcrypto/sha/asm/sha256-armv4.pl b/src/lib/libcrypto/sha/asm/sha256-armv4.pl index 492cb62bc0..9c84e8d93c 100644 --- a/src/lib/libcrypto/sha/asm/sha256-armv4.pl +++ b/src/lib/libcrypto/sha/asm/sha256-armv4.pl @@ -18,11 +18,16 @@ # Rescheduling for dual-issue pipeline resulted in 22% improvement on # Cortex A8 core and ~20 cycles per processed byte. +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 16% +# improvement on Cortex A8 core and ~17 cycles per processed byte. + while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; $ctx="r0"; $t0="r0"; -$inp="r1"; +$inp="r1"; $t3="r1"; $len="r2"; $t1="r2"; $T1="r3"; $A="r4"; @@ -46,6 +51,9 @@ sub BODY_00_15 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___ if ($i<16); +#if __ARM_ARCH__>=7 + ldr $T1,[$inp],#4 +#else ldrb $T1,[$inp,#3] @ $i ldrb $t2,[$inp,#2] ldrb $t1,[$inp,#1] @@ -53,16 +61,24 @@ $code.=<<___ if ($i<16); orr $T1,$T1,$t2,lsl#8 orr $T1,$T1,$t1,lsl#16 orr $T1,$T1,$t0,lsl#24 - `"str $inp,[sp,#17*4]" if ($i==15)` +#endif ___ $code.=<<___; - ldr $t2,[$Ktbl],#4 @ *K256++ mov $t0,$e,ror#$Sigma1[0] - str $T1,[sp,#`$i%16`*4] + ldr $t2,[$Ktbl],#4 @ *K256++ eor $t0,$t0,$e,ror#$Sigma1[1] eor $t1,$f,$g +#if $i>=16 + add $T1,$T1,$t3 @ from BODY_16_xx +#elif __ARM_ARCH__>=7 && defined(__ARMEL__) + rev $T1,$T1 +#endif +#if $i==15 + str $inp,[sp,#17*4] @ leave room for $t3 +#endif eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e) and $t1,$t1,$e + str $T1,[sp,#`$i%16`*4] add $T1,$T1,$t0 eor $t1,$t1,$g @ Ch(e,f,g) add $T1,$T1,$h @@ -71,6 +87,9 @@ $code.=<<___; eor $h,$h,$a,ror#$Sigma0[1] add $T1,$T1,$t2 eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a) +#if $i>=15 + ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx +#endif orr $t0,$a,$b and $t1,$a,$b and $t0,$t0,$c @@ -85,24 +104,26 @@ sub BODY_16_XX { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; - ldr $t1,[sp,#`($i+1)%16`*4] @ $i + @ ldr $t3,[sp,#`($i+1)%16`*4] @ $i ldr $t2,[sp,#`($i+14)%16`*4] + mov $t0,$t3,ror#$sigma0[0] ldr $T1,[sp,#`($i+0)%16`*4] - mov $t0,$t1,ror#$sigma0[0] - ldr $inp,[sp,#`($i+9)%16`*4] - eor $t0,$t0,$t1,ror#$sigma0[1] - eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1]) - mov $t1,$t2,ror#$sigma1[0] + eor $t0,$t0,$t3,ror#$sigma0[1] + ldr $t1,[sp,#`($i+9)%16`*4] + eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1]) + mov $t3,$t2,ror#$sigma1[0] add $T1,$T1,$t0 - eor $t1,$t1,$t2,ror#$sigma1[1] - add $T1,$T1,$inp - eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) + eor $t3,$t3,$t2,ror#$sigma1[1] add $T1,$T1,$t1 + eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14]) + @ add $T1,$T1,$t3 ___ &BODY_00_15(@_); } $code=<<___; +#include "arm_arch.h" + .text .code 32 @@ -132,7 +153,7 @@ K256: sha256_block_data_order: sub r3,pc,#8 @ sha256_block_data_order add $len,$inp,$len,lsl#6 @ len to point at the end of inp - stmdb sp!,{$ctx,$inp,$len,r4-r12,lr} + stmdb sp!,{$ctx,$inp,$len,r4-r11,lr} ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H} sub $Ktbl,r3,#256 @ K256 sub sp,sp,#16*4 @ alloca(X[16]) @@ -171,10 +192,14 @@ $code.=<<___; bne .Loop add sp,sp,#`16+3`*4 @ destroy frame - ldmia sp!,{r4-r12,lr} +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r11,pc} +#else + ldmia sp!,{r4-r11,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) +#endif .size sha256_block_data_order,.-sha256_block_data_order .asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by " .align 2 diff --git a/src/lib/libcrypto/sha/asm/sha512-armv4.pl b/src/lib/libcrypto/sha/asm/sha512-armv4.pl index 3a35861ac6..7faf37b147 100644 --- a/src/lib/libcrypto/sha/asm/sha512-armv4.pl +++ b/src/lib/libcrypto/sha/asm/sha512-armv4.pl @@ -18,22 +18,33 @@ # Rescheduling for dual-issue pipeline resulted in 6% improvement on # Cortex A8 core and ~40 cycles per processed byte. +# February 2011. +# +# Profiler-assisted and platform-specific optimization resulted in 7% +# improvement on Coxtex A8 core and ~38 cycles per byte. + +# March 2011. +# +# Add NEON implementation. On Cortex A8 it was measured to process +# one byte in 25.5 cycles or 47% faster than integer-only code. + # Byte order [in]dependence. ========================================= # -# Caller is expected to maintain specific *dword* order in h[0-7], -# namely with most significant dword at *lower* address, which is -# reflected in below two parameters. *Byte* order within these dwords -# in turn is whatever *native* byte order on current platform. -$hi=0; -$lo=4; +# Originally caller was expected to maintain specific *dword* order in +# h[0-7], namely with most significant dword at *lower* address, which +# was reflected in below two parameters as 0 and 4. Now caller is +# expected to maintain native byte order for whole 64-bit values. +$hi="HI"; +$lo="LO"; # ==================================================================== while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; -$ctx="r0"; +$ctx="r0"; # parameter block $inp="r1"; $len="r2"; + $Tlo="r3"; $Thi="r4"; $Alo="r5"; @@ -61,15 +72,17 @@ $Xoff=8*8; sub BODY_00_15() { my $magic = shift; $code.=<<___; - ldr $t2,[sp,#$Hoff+0] @ h.lo - ldr $t3,[sp,#$Hoff+4] @ h.hi @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 mov $t0,$Elo,lsr#14 + str $Tlo,[sp,#$Xoff+0] mov $t1,$Ehi,lsr#14 + str $Thi,[sp,#$Xoff+4] eor $t0,$t0,$Ehi,lsl#18 + ldr $t2,[sp,#$Hoff+0] @ h.lo eor $t1,$t1,$Elo,lsl#18 + ldr $t3,[sp,#$Hoff+4] @ h.hi eor $t0,$t0,$Elo,lsr#18 eor $t1,$t1,$Ehi,lsr#18 eor $t0,$t0,$Ehi,lsl#14 @@ -96,25 +109,24 @@ $code.=<<___; and $t1,$t1,$Ehi str $Ahi,[sp,#$Aoff+4] eor $t0,$t0,$t2 - ldr $t2,[$Ktbl,#4] @ K[i].lo + ldr $t2,[$Ktbl,#$lo] @ K[i].lo eor $t1,$t1,$t3 @ Ch(e,f,g) - ldr $t3,[$Ktbl,#0] @ K[i].hi + ldr $t3,[$Ktbl,#$hi] @ K[i].hi adds $Tlo,$Tlo,$t0 ldr $Elo,[sp,#$Doff+0] @ d.lo adc $Thi,$Thi,$t1 @ T += Ch(e,f,g) ldr $Ehi,[sp,#$Doff+4] @ d.hi adds $Tlo,$Tlo,$t2 + and $t0,$t2,#0xff adc $Thi,$Thi,$t3 @ T += K[i] adds $Elo,$Elo,$Tlo + ldr $t2,[sp,#$Boff+0] @ b.lo adc $Ehi,$Ehi,$Thi @ d += T - - and $t0,$t2,#0xff teq $t0,#$magic - orreq $Ktbl,$Ktbl,#1 - ldr $t2,[sp,#$Boff+0] @ b.lo ldr $t3,[sp,#$Coff+0] @ c.lo + orreq $Ktbl,$Ktbl,#1 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 @@ -131,80 +143,100 @@ $code.=<<___; eor $t0,$t0,$Alo,lsl#25 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a) adds $Tlo,$Tlo,$t0 + and $t0,$Alo,$t2 adc $Thi,$Thi,$t1 @ T += Sigma0(a) - and $t0,$Alo,$t2 - orr $Alo,$Alo,$t2 ldr $t1,[sp,#$Boff+4] @ b.hi + orr $Alo,$Alo,$t2 ldr $t2,[sp,#$Coff+4] @ c.hi and $Alo,$Alo,$t3 - orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo and $t3,$Ahi,$t1 orr $Ahi,$Ahi,$t1 + orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo and $Ahi,$Ahi,$t2 - orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi adds $Alo,$Alo,$Tlo - adc $Ahi,$Ahi,$Thi @ h += T - + orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi sub sp,sp,#8 + adc $Ahi,$Ahi,$Thi @ h += T + tst $Ktbl,#1 add $Ktbl,$Ktbl,#8 ___ } $code=<<___; +#include "arm_arch.h" +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + .text .code 32 .type K512,%object .align 5 K512: -.word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd -.word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc -.word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019 -.word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118 -.word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe -.word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2 -.word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1 -.word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694 -.word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3 -.word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65 -.word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483 -.word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5 -.word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210 -.word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4 -.word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725 -.word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70 -.word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926 -.word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df -.word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8 -.word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b -.word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001 -.word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30 -.word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910 -.word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8 -.word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53 -.word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8 -.word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb -.word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3 -.word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60 -.word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec -.word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9 -.word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b -.word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207 -.word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178 -.word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6 -.word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b -.word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493 -.word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c -.word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a -.word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817 +WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) +WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) +WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) +WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) +WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) +WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) +WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) +WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) +WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) +WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) +WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) +WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) +WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) +WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) +WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) +WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) +WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) +WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) +WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) +WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) +WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) +WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) +WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) +WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) +WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) +WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) +WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) +WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) +WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) +WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) +WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) +WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) +WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) +WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) +WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) +WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) +WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) +WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) +WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) +WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-sha512_block_data_order +.skip 32-4 .global sha512_block_data_order .type sha512_block_data_order,%function sha512_block_data_order: sub r3,pc,#8 @ sha512_block_data_order add $len,$inp,$len,lsl#7 @ len to point at the end of inp +#if __ARM_ARCH__>=7 + ldr r12,.LOPENSSL_armcap + ldr r12,[r3,r12] @ OPENSSL_armcap_P + tst r12,#1 + bne .LNEON +#endif stmdb sp!,{r4-r12,lr} - sub $Ktbl,r3,#640 @ K512 + sub $Ktbl,r3,#672 @ K512 sub sp,sp,#9*8 ldr $Elo,[$ctx,#$Eoff+$lo] @@ -238,6 +270,7 @@ sha512_block_data_order: str $Thi,[sp,#$Foff+4] .L00_15: +#if __ARM_ARCH__<7 ldrb $Tlo,[$inp,#7] ldrb $t0, [$inp,#6] ldrb $t1, [$inp,#5] @@ -252,26 +285,30 @@ sha512_block_data_order: orr $Thi,$Thi,$t3,lsl#8 orr $Thi,$Thi,$t0,lsl#16 orr $Thi,$Thi,$t1,lsl#24 - str $Tlo,[sp,#$Xoff+0] - str $Thi,[sp,#$Xoff+4] +#else + ldr $Tlo,[$inp,#4] + ldr $Thi,[$inp],#8 +#ifdef __ARMEL__ + rev $Tlo,$Tlo + rev $Thi,$Thi +#endif +#endif ___ &BODY_00_15(0x94); $code.=<<___; tst $Ktbl,#1 beq .L00_15 - bic $Ktbl,$Ktbl,#1 - -.L16_79: ldr $t0,[sp,#`$Xoff+8*(16-1)`+0] ldr $t1,[sp,#`$Xoff+8*(16-1)`+4] - ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] - ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] - + bic $Ktbl,$Ktbl,#1 +.L16_79: @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 mov $Tlo,$t0,lsr#1 + ldr $t2,[sp,#`$Xoff+8*(16-14)`+0] mov $Thi,$t1,lsr#1 + ldr $t3,[sp,#`$Xoff+8*(16-14)`+4] eor $Tlo,$Tlo,$t1,lsl#31 eor $Thi,$Thi,$t0,lsl#31 eor $Tlo,$Tlo,$t0,lsr#8 @@ -295,25 +332,24 @@ $code.=<<___; eor $t1,$t1,$t3,lsl#3 eor $t0,$t0,$t2,lsr#6 eor $t1,$t1,$t3,lsr#6 + ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] eor $t0,$t0,$t3,lsl#26 - ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] ldr $t3,[sp,#`$Xoff+8*(16-9)`+4] adds $Tlo,$Tlo,$t0 + ldr $t0,[sp,#`$Xoff+8*16`+0] adc $Thi,$Thi,$t1 - ldr $t0,[sp,#`$Xoff+8*16`+0] ldr $t1,[sp,#`$Xoff+8*16`+4] adds $Tlo,$Tlo,$t2 adc $Thi,$Thi,$t3 adds $Tlo,$Tlo,$t0 adc $Thi,$Thi,$t1 - str $Tlo,[sp,#$Xoff+0] - str $Thi,[sp,#$Xoff+4] ___ &BODY_00_15(0x17); $code.=<<___; - tst $Ktbl,#1 + ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0] + ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4] beq .L16_79 bic $Ktbl,$Ktbl,#1 @@ -324,12 +360,12 @@ $code.=<<___; ldr $t2, [$ctx,#$Boff+$lo] ldr $t3, [$ctx,#$Boff+$hi] adds $t0,$Alo,$t0 - adc $t1,$Ahi,$t1 - adds $t2,$Tlo,$t2 - adc $t3,$Thi,$t3 str $t0, [$ctx,#$Aoff+$lo] + adc $t1,$Ahi,$t1 str $t1, [$ctx,#$Aoff+$hi] + adds $t2,$Tlo,$t2 str $t2, [$ctx,#$Boff+$lo] + adc $t3,$Thi,$t3 str $t3, [$ctx,#$Boff+$hi] ldr $Alo,[sp,#$Coff+0] @@ -341,12 +377,12 @@ $code.=<<___; ldr $t2, [$ctx,#$Doff+$lo] ldr $t3, [$ctx,#$Doff+$hi] adds $t0,$Alo,$t0 - adc $t1,$Ahi,$t1 - adds $t2,$Tlo,$t2 - adc $t3,$Thi,$t3 str $t0, [$ctx,#$Coff+$lo] + adc $t1,$Ahi,$t1 str $t1, [$ctx,#$Coff+$hi] + adds $t2,$Tlo,$t2 str $t2, [$ctx,#$Doff+$lo] + adc $t3,$Thi,$t3 str $t3, [$ctx,#$Doff+$hi] ldr $Tlo,[sp,#$Foff+0] @@ -356,12 +392,12 @@ $code.=<<___; ldr $t2, [$ctx,#$Foff+$lo] ldr $t3, [$ctx,#$Foff+$hi] adds $Elo,$Elo,$t0 - adc $Ehi,$Ehi,$t1 - adds $t2,$Tlo,$t2 - adc $t3,$Thi,$t3 str $Elo,[$ctx,#$Eoff+$lo] + adc $Ehi,$Ehi,$t1 str $Ehi,[$ctx,#$Eoff+$hi] + adds $t2,$Tlo,$t2 str $t2, [$ctx,#$Foff+$lo] + adc $t3,$Thi,$t3 str $t3, [$ctx,#$Foff+$hi] ldr $Alo,[sp,#$Goff+0] @@ -373,12 +409,12 @@ $code.=<<___; ldr $t2, [$ctx,#$Hoff+$lo] ldr $t3, [$ctx,#$Hoff+$hi] adds $t0,$Alo,$t0 - adc $t1,$Ahi,$t1 - adds $t2,$Tlo,$t2 - adc $t3,$Thi,$t3 str $t0, [$ctx,#$Goff+$lo] + adc $t1,$Ahi,$t1 str $t1, [$ctx,#$Goff+$hi] + adds $t2,$Tlo,$t2 str $t2, [$ctx,#$Hoff+$lo] + adc $t3,$Thi,$t3 str $t3, [$ctx,#$Hoff+$hi] add sp,sp,#640 @@ -388,13 +424,156 @@ $code.=<<___; bne .Loop add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH__>=5 + ldmia sp!,{r4-r12,pc} +#else ldmia sp!,{r4-r12,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet bx lr @ interoperable with Thumb ISA:-) -.size sha512_block_data_order,.-sha512_block_data_order -.asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by " +#endif +___ + +{ +my @Sigma0=(28,34,39); +my @Sigma1=(14,18,41); +my @sigma0=(1, 8, 7); +my @sigma1=(19,61,6); + +my $Ktbl="r3"; +my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch + +my @X=map("d$_",(0..15)); +my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23)); + +sub NEON_00_15() { +my $i=shift; +my ($a,$b,$c,$d,$e,$f,$g,$h)=@_; +my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps + +$code.=<<___ if ($i<16 || $i&1); + vshr.u64 $t0,$e,#@Sigma1[0] @ $i +#if $i<16 + vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned +#endif + vshr.u64 $t1,$e,#@Sigma1[1] + vshr.u64 $t2,$e,#@Sigma1[2] +___ +$code.=<<___; + vld1.64 {$K},[$Ktbl,:64]! @ K[i++] + vsli.64 $t0,$e,#`64-@Sigma1[0]` + vsli.64 $t1,$e,#`64-@Sigma1[1]` + vsli.64 $t2,$e,#`64-@Sigma1[2]` +#if $i<16 && defined(__ARMEL__) + vrev64.8 @X[$i],@X[$i] +#endif + vadd.i64 $T1,$K,$h + veor $Ch,$f,$g + veor $t0,$t1 + vand $Ch,$e + veor $t0,$t2 @ Sigma1(e) + veor $Ch,$g @ Ch(e,f,g) + vadd.i64 $T1,$t0 + vshr.u64 $t0,$a,#@Sigma0[0] + vadd.i64 $T1,$Ch + vshr.u64 $t1,$a,#@Sigma0[1] + vshr.u64 $t2,$a,#@Sigma0[2] + vsli.64 $t0,$a,#`64-@Sigma0[0]` + vsli.64 $t1,$a,#`64-@Sigma0[1]` + vsli.64 $t2,$a,#`64-@Sigma0[2]` + vadd.i64 $T1,@X[$i%16] + vorr $Maj,$a,$c + vand $Ch,$a,$c + veor $h,$t0,$t1 + vand $Maj,$b + veor $h,$t2 @ Sigma0(a) + vorr $Maj,$Ch @ Maj(a,b,c) + vadd.i64 $h,$T1 + vadd.i64 $d,$T1 + vadd.i64 $h,$Maj +___ +} + +sub NEON_16_79() { +my $i=shift; + +if ($i&1) { &NEON_00_15($i,@_); return; } + +# 2x-vectorized, therefore runs every 2nd round +my @X=map("q$_",(0..7)); # view @X as 128-bit vector +my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps +my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15 +my $e=@_[4]; # $e from NEON_00_15 +$i /= 2; +$code.=<<___; + vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0] + vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1] + vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2] + vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]` + vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1] + vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]` + veor $s1,$t0 + vshr.u64 $t0,$s0,#@sigma0[0] + veor $s1,$t1 @ sigma1(X[i+14]) + vshr.u64 $t1,$s0,#@sigma0[1] + vadd.i64 @X[$i%8],$s1 + vshr.u64 $s1,$s0,#@sigma0[2] + vsli.64 $t0,$s0,#`64-@sigma0[0]` + vsli.64 $t1,$s0,#`64-@sigma0[1]` + vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9] + veor $s1,$t0 + vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s0 + vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15 + veor $s1,$t1 @ sigma0(X[i+1]) + vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15 + vadd.i64 @X[$i%8],$s1 +___ + &NEON_00_15(2*$i,@_); +} + +$code.=<<___; +#if __ARM_ARCH__>=7 +.fpu neon + +.align 4 +.LNEON: + dmb @ errata #451034 on early Cortex A8 + vstmdb sp!,{d8-d15} @ ABI specification says so + sub $Ktbl,r3,#672 @ K512 + vldmia $ctx,{$A-$H} @ load context +.Loop_neon: +___ +for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + mov $cnt,#4 +.L16_79_neon: + subs $cnt,#1 +___ +for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + bne .L16_79_neon + + vldmia $ctx,{d24-d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia $ctx,{$A-$H} @ save context + teq $inp,$len + sub $Ktbl,#640 @ rewind K512 + bne .Loop_neon + + vldmia sp!,{d8-d15} @ epilogue + bx lr +#endif +___ +} +$code.=<<___; +.size sha512_block_data_order,.-sha512_block_data_order +.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by " .align 2 +.comm OPENSSL_armcap_P,4,4 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/src/lib/libcrypto/sha/asm/sha512-mips.pl b/src/lib/libcrypto/sha/asm/sha512-mips.pl new file mode 100644 index 0000000000..ba5b250890 --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha512-mips.pl @@ -0,0 +1,455 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA2 block procedures for MIPS. + +# October 2010. +# +# SHA256 performance improvement on MIPS R5000 CPU is ~27% over gcc- +# generated code in o32 build and ~55% in n32/64 build. SHA512 [which +# for now can only be compiled for MIPS64 ISA] improvement is modest +# ~17%, but it comes for free, because it's same instruction sequence. +# Improvement coefficients are for aligned input. + +###################################################################### +# There is a number of MIPS ABI in use, O32 and N32/64 are most +# widely used. Then there is a new contender: NUBI. It appears that if +# one picks the latter, it's possible to arrange code in ABI neutral +# manner. Therefore let's stick to NUBI register layout: +# +($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25)); +($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23)); +($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31)); +# +# The return value is placed in $a0. Following coding rules facilitate +# interoperability: +# +# - never ever touch $tp, "thread pointer", former $gp [o32 can be +# excluded from the rule, because it's specified volatile]; +# - copy return value to $t0, former $v0 [or to $a0 if you're adapting +# old code]; +# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary; +# +# For reference here is register layout for N32/64 MIPS ABIs: +# +# ($zero,$at,$v0,$v1)=map("\$$_",(0..3)); +# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11)); +# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25)); +# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23)); +# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31)); +# +$flavour = shift; # supported flavours are o32,n32,64,nubi32,nubi64 + +if ($flavour =~ /64|n32/i) { + $PTR_ADD="dadd"; # incidentally works even on n32 + $PTR_SUB="dsub"; # incidentally works even on n32 + $REG_S="sd"; + $REG_L="ld"; + $PTR_SLL="dsll"; # incidentally works even on n32 + $SZREG=8; +} else { + $PTR_ADD="add"; + $PTR_SUB="sub"; + $REG_S="sw"; + $REG_L="lw"; + $PTR_SLL="sll"; + $SZREG=4; +} +$pf = ($flavour =~ /nubi/i) ? $t0 : $t2; +# +# +# +###################################################################### + +$big_endian=(`echo MIPSEL | $ENV{CC} -E -P -`=~/MIPSEL/)?1:0; + +for (@ARGV) { $output=$_ if (/^\w[\w\-]*\.\w+$/); } +open STDOUT,">$output"; + +if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); } + +if ($output =~ /512/) { + $label="512"; + $SZ=8; + $LD="ld"; # load from memory + $ST="sd"; # store to memory + $SLL="dsll"; # shift left logical + $SRL="dsrl"; # shift right logical + $ADDU="daddu"; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=( 7, 1, 8); # right shift first + @sigma1=( 6,19,61); # right shift first + $lastK=0x817; + $rounds=80; +} else { + $label="256"; + $SZ=4; + $LD="lw"; # load from memory + $ST="sw"; # store to memory + $SLL="sll"; # shift left logical + $SRL="srl"; # shift right logical + $ADDU="addu"; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 3, 7,18); # right shift first + @sigma1=(10,17,19); # right shift first + $lastK=0x8f2; + $rounds=64; +} + +$MSB = $big_endian ? 0 : ($SZ-1); +$LSB = ($SZ-1)&~$MSB; + +@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("\$$_",(1,2,3,7,24,25,30,31)); +@X=map("\$$_",(8..23)); + +$ctx=$a0; +$inp=$a1; +$len=$a2; $Ktbl=$len; + +sub BODY_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +my ($T1,$tmp0,$tmp1,$tmp2)=(@X[4],@X[5],@X[6],@X[7]); + +$code.=<<___ if ($i<15); + ${LD}l @X[1],`($i+1)*$SZ+$MSB`($inp) + ${LD}r @X[1],`($i+1)*$SZ+$LSB`($inp) +___ +$code.=<<___ if (!$big_endian && $i<16 && $SZ==4); + srl $tmp0,@X[0],24 # byte swap($i) + srl $tmp1,@X[0],8 + andi $tmp2,@X[0],0xFF00 + sll @X[0],@X[0],24 + andi $tmp1,0xFF00 + sll $tmp2,$tmp2,8 + or @X[0],$tmp0 + or $tmp1,$tmp2 + or @X[0],$tmp1 +___ +$code.=<<___ if (!$big_endian && $i<16 && $SZ==8); + ori $tmp0,$zero,0xFF + dsll $tmp2,$tmp0,32 + or $tmp0,$tmp2 # 0x000000FF000000FF + and $tmp1,@X[0],$tmp0 # byte swap($i) + dsrl $tmp2,@X[0],24 + dsll $tmp1,24 + and $tmp2,$tmp0 + dsll $tmp0,8 # 0x0000FF000000FF00 + or $tmp1,$tmp2 + and $tmp2,@X[0],$tmp0 + dsrl @X[0],8 + dsll $tmp2,8 + and @X[0],$tmp0 + or $tmp1,$tmp2 + or @X[0],$tmp1 + dsrl $tmp1,@X[0],32 + dsll @X[0],32 + or @X[0],$tmp1 +___ +$code.=<<___; + $ADDU $T1,$X[0],$h # $i + $SRL $h,$e,@Sigma1[0] + xor $tmp2,$f,$g + $SLL $tmp1,$e,`$SZ*8-@Sigma1[2]` + and $tmp2,$e + $SRL $tmp0,$e,@Sigma1[1] + xor $h,$tmp1 + $SLL $tmp1,$e,`$SZ*8-@Sigma1[1]` + xor $h,$tmp0 + $SRL $tmp0,$e,@Sigma1[2] + xor $h,$tmp1 + $SLL $tmp1,$e,`$SZ*8-@Sigma1[0]` + xor $h,$tmp0 + xor $tmp2,$g # Ch(e,f,g) + xor $tmp0,$tmp1,$h # Sigma1(e) + + $SRL $h,$a,@Sigma0[0] + $ADDU $T1,$tmp2 + $LD $tmp2,`$i*$SZ`($Ktbl) # K[$i] + $SLL $tmp1,$a,`$SZ*8-@Sigma0[2]` + $ADDU $T1,$tmp0 + $SRL $tmp0,$a,@Sigma0[1] + xor $h,$tmp1 + $SLL $tmp1,$a,`$SZ*8-@Sigma0[1]` + xor $h,$tmp0 + $SRL $tmp0,$a,@Sigma0[2] + xor $h,$tmp1 + $SLL $tmp1,$a,`$SZ*8-@Sigma0[0]` + xor $h,$tmp0 + $ST @X[0],`($i%16)*$SZ`($sp) # offload to ring buffer + xor $h,$tmp1 # Sigma0(a) + + or $tmp0,$a,$b + and $tmp1,$a,$b + and $tmp0,$c + or $tmp1,$tmp0 # Maj(a,b,c) + $ADDU $T1,$tmp2 # +=K[$i] + $ADDU $h,$tmp1 + + $ADDU $d,$T1 + $ADDU $h,$T1 +___ +$code.=<<___ if ($i>=13); + $LD @X[3],`(($i+3)%16)*$SZ`($sp) # prefetch from ring buffer +___ +} + +sub BODY_16_XX { +my $i=@_[0]; +my ($tmp0,$tmp1,$tmp2,$tmp3)=(@X[4],@X[5],@X[6],@X[7]); + +$code.=<<___; + $SRL $tmp2,@X[1],@sigma0[0] # Xupdate($i) + $ADDU @X[0],@X[9] # +=X[i+9] + $SLL $tmp1,@X[1],`$SZ*8-@sigma0[2]` + $SRL $tmp0,@X[1],@sigma0[1] + xor $tmp2,$tmp1 + $SLL $tmp1,`@sigma0[2]-@sigma0[1]` + xor $tmp2,$tmp0 + $SRL $tmp0,@X[1],@sigma0[2] + xor $tmp2,$tmp1 + + $SRL $tmp3,@X[14],@sigma1[0] + xor $tmp2,$tmp0 # sigma0(X[i+1]) + $SLL $tmp1,@X[14],`$SZ*8-@sigma1[2]` + $ADDU @X[0],$tmp2 + $SRL $tmp0,@X[14],@sigma1[1] + xor $tmp3,$tmp1 + $SLL $tmp1,`@sigma1[2]-@sigma1[1]` + xor $tmp3,$tmp0 + $SRL $tmp0,@X[14],@sigma1[2] + xor $tmp3,$tmp1 + + xor $tmp3,$tmp0 # sigma1(X[i+14]) + $ADDU @X[0],$tmp3 +___ + &BODY_00_15(@_); +} + +$FRAMESIZE=16*$SZ+16*$SZREG; +$SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0xc0fff008 : 0xc0ff0000; + +$code.=<<___; +#ifdef OPENSSL_FIPSCANISTER +# include +#endif + +.text +.set noat +#if !defined(__vxworks) || defined(__pic__) +.option pic2 +#endif + +.align 5 +.globl sha${label}_block_data_order +.ent sha${label}_block_data_order +sha${label}_block_data_order: + .frame $sp,$FRAMESIZE,$ra + .mask $SAVED_REGS_MASK,-$SZREG + .set noreorder +___ +$code.=<<___ if ($flavour =~ /o32/i); # o32 PIC-ification + .cpload $pf +___ +$code.=<<___; + $PTR_SUB $sp,$FRAMESIZE + $REG_S $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_S $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_S $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_S $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_S $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_S $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_S $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_S $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_S $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_S $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); # optimize non-nubi prologue + $REG_S $s3,$FRAMESIZE-11*$SZREG($sp) + $REG_S $s2,$FRAMESIZE-12*$SZREG($sp) + $REG_S $s1,$FRAMESIZE-13*$SZREG($sp) + $REG_S $s0,$FRAMESIZE-14*$SZREG($sp) + $REG_S $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + $PTR_SLL @X[15],$len,`log(16*$SZ)/log(2)` +___ +$code.=<<___ if ($flavour !~ /o32/i); # non-o32 PIC-ification + .cplocal $Ktbl + .cpsetup $pf,$zero,sha${label}_block_data_order +___ +$code.=<<___; + .set reorder + la $Ktbl,K${label} # PIC-ified 'load address' + + $LD $A,0*$SZ($ctx) # load context + $LD $B,1*$SZ($ctx) + $LD $C,2*$SZ($ctx) + $LD $D,3*$SZ($ctx) + $LD $E,4*$SZ($ctx) + $LD $F,5*$SZ($ctx) + $LD $G,6*$SZ($ctx) + $LD $H,7*$SZ($ctx) + + $PTR_ADD @X[15],$inp # pointer to the end of input + $REG_S @X[15],16*$SZ($sp) + b .Loop + +.align 5 +.Loop: + ${LD}l @X[0],$MSB($inp) + ${LD}r @X[0],$LSB($inp) +___ +for ($i=0;$i<16;$i++) +{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } +$code.=<<___; + b .L16_xx +.align 4 +.L16_xx: +___ +for (;$i<32;$i++) +{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); push(@X,shift(@X)); } +$code.=<<___; + and @X[6],0xfff + li @X[7],$lastK + .set noreorder + bne @X[6],@X[7],.L16_xx + $PTR_ADD $Ktbl,16*$SZ # Ktbl+=16 + + $REG_L @X[15],16*$SZ($sp) # restore pointer to the end of input + $LD @X[0],0*$SZ($ctx) + $LD @X[1],1*$SZ($ctx) + $LD @X[2],2*$SZ($ctx) + $PTR_ADD $inp,16*$SZ + $LD @X[3],3*$SZ($ctx) + $ADDU $A,@X[0] + $LD @X[4],4*$SZ($ctx) + $ADDU $B,@X[1] + $LD @X[5],5*$SZ($ctx) + $ADDU $C,@X[2] + $LD @X[6],6*$SZ($ctx) + $ADDU $D,@X[3] + $LD @X[7],7*$SZ($ctx) + $ADDU $E,@X[4] + $ST $A,0*$SZ($ctx) + $ADDU $F,@X[5] + $ST $B,1*$SZ($ctx) + $ADDU $G,@X[6] + $ST $C,2*$SZ($ctx) + $ADDU $H,@X[7] + $ST $D,3*$SZ($ctx) + $ST $E,4*$SZ($ctx) + $ST $F,5*$SZ($ctx) + $ST $G,6*$SZ($ctx) + $ST $H,7*$SZ($ctx) + + bnel $inp,@X[15],.Loop + $PTR_SUB $Ktbl,`($rounds-16)*$SZ` # rewind $Ktbl + + $REG_L $ra,$FRAMESIZE-1*$SZREG($sp) + $REG_L $fp,$FRAMESIZE-2*$SZREG($sp) + $REG_L $s11,$FRAMESIZE-3*$SZREG($sp) + $REG_L $s10,$FRAMESIZE-4*$SZREG($sp) + $REG_L $s9,$FRAMESIZE-5*$SZREG($sp) + $REG_L $s8,$FRAMESIZE-6*$SZREG($sp) + $REG_L $s7,$FRAMESIZE-7*$SZREG($sp) + $REG_L $s6,$FRAMESIZE-8*$SZREG($sp) + $REG_L $s5,$FRAMESIZE-9*$SZREG($sp) + $REG_L $s4,$FRAMESIZE-10*$SZREG($sp) +___ +$code.=<<___ if ($flavour =~ /nubi/i); + $REG_L $s3,$FRAMESIZE-11*$SZREG($sp) + $REG_L $s2,$FRAMESIZE-12*$SZREG($sp) + $REG_L $s1,$FRAMESIZE-13*$SZREG($sp) + $REG_L $s0,$FRAMESIZE-14*$SZREG($sp) + $REG_L $gp,$FRAMESIZE-15*$SZREG($sp) +___ +$code.=<<___; + jr $ra + $PTR_ADD $sp,$FRAMESIZE +.end sha${label}_block_data_order + +.rdata +.align 5 +K${label}: +___ +if ($SZ==4) { +$code.=<<___; + .word 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .word 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .word 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .word 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .word 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .word 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .word 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .word 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .word 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .word 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .word 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .word 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .word 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .word 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .word 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +___ +} else { +$code.=<<___; + .dword 0x428a2f98d728ae22, 0x7137449123ef65cd + .dword 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc + .dword 0x3956c25bf348b538, 0x59f111f1b605d019 + .dword 0x923f82a4af194f9b, 0xab1c5ed5da6d8118 + .dword 0xd807aa98a3030242, 0x12835b0145706fbe + .dword 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2 + .dword 0x72be5d74f27b896f, 0x80deb1fe3b1696b1 + .dword 0x9bdc06a725c71235, 0xc19bf174cf692694 + .dword 0xe49b69c19ef14ad2, 0xefbe4786384f25e3 + .dword 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65 + .dword 0x2de92c6f592b0275, 0x4a7484aa6ea6e483 + .dword 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5 + .dword 0x983e5152ee66dfab, 0xa831c66d2db43210 + .dword 0xb00327c898fb213f, 0xbf597fc7beef0ee4 + .dword 0xc6e00bf33da88fc2, 0xd5a79147930aa725 + .dword 0x06ca6351e003826f, 0x142929670a0e6e70 + .dword 0x27b70a8546d22ffc, 0x2e1b21385c26c926 + .dword 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df + .dword 0x650a73548baf63de, 0x766a0abb3c77b2a8 + .dword 0x81c2c92e47edaee6, 0x92722c851482353b + .dword 0xa2bfe8a14cf10364, 0xa81a664bbc423001 + .dword 0xc24b8b70d0f89791, 0xc76c51a30654be30 + .dword 0xd192e819d6ef5218, 0xd69906245565a910 + .dword 0xf40e35855771202a, 0x106aa07032bbd1b8 + .dword 0x19a4c116b8d2d0c8, 0x1e376c085141ab53 + .dword 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8 + .dword 0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb + .dword 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3 + .dword 0x748f82ee5defb2fc, 0x78a5636f43172f60 + .dword 0x84c87814a1f0ab72, 0x8cc702081a6439ec + .dword 0x90befffa23631e28, 0xa4506cebde82bde9 + .dword 0xbef9a3f7b2c67915, 0xc67178f2e372532b + .dword 0xca273eceea26619c, 0xd186b8c721c0c207 + .dword 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178 + .dword 0x06f067aa72176fba, 0x0a637dc5a2c898a6 + .dword 0x113f9804bef90dae, 0x1b710b35131c471b + .dword 0x28db77f523047d84, 0x32caab7b40c72493 + .dword 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c + .dword 0x4cc5d4becb3e42b6, 0x597f299cfc657e2a + .dword 0x5fcb6fab3ad6faec, 0x6c44198c4a475817 +___ +} +$code.=<<___; +.asciiz "SHA${label} for MIPS, CRYPTOGAMS by " +.align 5 + +___ + +$code =~ s/\`([^\`]*)\`/eval $1/gem; +print $code; +close STDOUT; diff --git a/src/lib/libcrypto/sha/asm/sha512-parisc.pl b/src/lib/libcrypto/sha/asm/sha512-parisc.pl new file mode 100755 index 0000000000..e24ee58ae9 --- /dev/null +++ b/src/lib/libcrypto/sha/asm/sha512-parisc.pl @@ -0,0 +1,791 @@ +#!/usr/bin/env perl + +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# SHA256/512 block procedure for PA-RISC. + +# June 2009. +# +# SHA256 performance is >75% better than gcc 3.2 generated code on +# PA-7100LC. Compared to code generated by vendor compiler this +# implementation is almost 70% faster in 64-bit build, but delivers +# virtually same performance in 32-bit build on PA-8600. +# +# SHA512 performance is >2.9x better than gcc 3.2 generated code on +# PA-7100LC, PA-RISC 1.1 processor. Then implementation detects if the +# code is executed on PA-RISC 2.0 processor and switches to 64-bit +# code path delivering adequate peformance even in "blended" 32-bit +# build. Though 64-bit code is not any faster than code generated by +# vendor compiler on PA-8600... +# +# Special thanks to polarhome.com for providing HP-UX account. + +$flavour = shift; +$output = shift; +open STDOUT,">$output"; + +if ($flavour =~ /64/) { + $LEVEL ="2.0W"; + $SIZE_T =8; + $FRAME_MARKER =80; + $SAVED_RP =16; + $PUSH ="std"; + $PUSHMA ="std,ma"; + $POP ="ldd"; + $POPMB ="ldd,mb"; +} else { + $LEVEL ="1.0"; + $SIZE_T =4; + $FRAME_MARKER =48; + $SAVED_RP =20; + $PUSH ="stw"; + $PUSHMA ="stwm"; + $POP ="ldw"; + $POPMB ="ldwm"; +} + +if ($output =~ /512/) { + $func="sha512_block_data_order"; + $SZ=8; + @Sigma0=(28,34,39); + @Sigma1=(14,18,41); + @sigma0=(1, 8, 7); + @sigma1=(19,61, 6); + $rounds=80; + $LAST10BITS=0x017; + $LD="ldd"; + $LDM="ldd,ma"; + $ST="std"; +} else { + $func="sha256_block_data_order"; + $SZ=4; + @Sigma0=( 2,13,22); + @Sigma1=( 6,11,25); + @sigma0=( 7,18, 3); + @sigma1=(17,19,10); + $rounds=64; + $LAST10BITS=0x0f2; + $LD="ldw"; + $LDM="ldwm"; + $ST="stw"; +} + +$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker + # [+ argument transfer] +$XOFF=16*$SZ+32; # local variables +$FRAME+=$XOFF; +$XOFF+=$FRAME_MARKER; # distance between %sp and local variables + +$ctx="%r26"; # zapped by $a0 +$inp="%r25"; # zapped by $a1 +$num="%r24"; # zapped by $t0 + +$a0 ="%r26"; +$a1 ="%r25"; +$t0 ="%r24"; +$t1 ="%r29"; +$Tbl="%r31"; + +@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28"); + +@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", + "%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp); + +sub ROUND_00_15 { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +$code.=<<___; + _ror $e,$Sigma1[0],$a0 + and $f,$e,$t0 + _ror $e,$Sigma1[1],$a1 + addl $t1,$h,$h + andcm $g,$e,$t1 + xor $a1,$a0,$a0 + _ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1 + or $t0,$t1,$t1 ; Ch(e,f,g) + addl @X[$i%16],$h,$h + xor $a0,$a1,$a1 ; Sigma1(e) + addl $t1,$h,$h + _ror $a,$Sigma0[0],$a0 + addl $a1,$h,$h + + _ror $a,$Sigma0[1],$a1 + and $a,$b,$t0 + and $a,$c,$t1 + xor $a1,$a0,$a0 + _ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1 + xor $t1,$t0,$t0 + and $b,$c,$t1 + xor $a0,$a1,$a1 ; Sigma0(a) + addl $h,$d,$d + xor $t1,$t0,$t0 ; Maj(a,b,c) + `"$LDM $SZ($Tbl),$t1" if ($i<15)` + addl $a1,$h,$h + addl $t0,$h,$h + +___ +} + +sub ROUND_16_xx { +my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; +$i-=16; +$code.=<<___; + _ror @X[($i+1)%16],$sigma0[0],$a0 + _ror @X[($i+1)%16],$sigma0[1],$a1 + addl @X[($i+9)%16],@X[$i],@X[$i] + _ror @X[($i+14)%16],$sigma1[0],$t0 + _ror @X[($i+14)%16],$sigma1[1],$t1 + xor $a1,$a0,$a0 + _shr @X[($i+1)%16],$sigma0[2],$a1 + xor $t1,$t0,$t0 + _shr @X[($i+14)%16],$sigma1[2],$t1 + xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f]) + xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f]) + $LDM $SZ($Tbl),$t1 + addl $a0,@X[$i],@X[$i] + addl $t0,@X[$i],@X[$i] +___ +$code.=<<___ if ($i==15); + extru $t1,31,10,$a1 + comiclr,<> $LAST10BITS,$a1,%r0 + ldo 1($Tbl),$Tbl ; signal end of $Tbl +___ +&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h); +} + +$code=<<___; + .LEVEL $LEVEL + .SPACE \$TEXT\$ + .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY + + .ALIGN 64 +L\$table +___ +$code.=<<___ if ($SZ==8); + .WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd + .WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc + .WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019 + .WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118 + .WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe + .WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2 + .WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1 + .WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694 + .WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3 + .WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65 + .WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483 + .WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5 + .WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210 + .WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4 + .WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725 + .WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70 + .WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926 + .WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df + .WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8 + .WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b + .WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001 + .WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30 + .WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910 + .WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8 + .WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53 + .WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8 + .WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb + .WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3 + .WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60 + .WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec + .WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9 + .WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b + .WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207 + .WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178 + .WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6 + .WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b + .WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493 + .WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c + .WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a + .WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817 +___ +$code.=<<___ if ($SZ==4); + .WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +___ +$code.=<<___; + + .EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR + .ALIGN 64 +$func + .PROC + .CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18 + .ENTRY + $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue + $PUSHMA %r3,$FRAME(%sp) + $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) + $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) + $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) + $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) + $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) + $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) + $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) + $PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp) + $PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp) + $PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp) + $PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp) + $PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp) + $PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp) + $PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp) + $PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp) + + _shl $num,`log(16*$SZ)/log(2)`,$num + addl $inp,$num,$num ; $num to point at the end of $inp + + $PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments + $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) + $PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp) + + blr %r0,$Tbl + ldi 3,$t1 +L\$pic + andcm $Tbl,$t1,$Tbl ; wipe privilege level + ldo L\$table-L\$pic($Tbl),$Tbl +___ +$code.=<<___ if ($SZ==8 && $SIZE_T==4); + ldi 31,$t1 + mtctl $t1,%cr11 + extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0 + b L\$parisc1 + nop +___ +$code.=<<___; + $LD `0*$SZ`($ctx),$A ; load context + $LD `1*$SZ`($ctx),$B + $LD `2*$SZ`($ctx),$C + $LD `3*$SZ`($ctx),$D + $LD `4*$SZ`($ctx),$E + $LD `5*$SZ`($ctx),$F + $LD `6*$SZ`($ctx),$G + $LD `7*$SZ`($ctx),$H + + extru $inp,31,`log($SZ)/log(2)`,$t0 + sh3addl $t0,%r0,$t0 + subi `8*$SZ`,$t0,$t0 + mtctl $t0,%cr11 ; load %sar with align factor + +L\$oop + ldi `$SZ-1`,$t0 + $LDM $SZ($Tbl),$t1 + andcm $inp,$t0,$t0 ; align $inp +___ + for ($i=0;$i<15;$i++) { # load input block + $code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; } +$code.=<<___; + cmpb,*= $inp,$t0,L\$aligned + $LD `$SZ*15`($t0),@X[15] + $LD `$SZ*16`($t0),@X[16] +___ + for ($i=0;$i<16;$i++) { # align data + $code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; } +$code.=<<___; +L\$aligned + nop ; otherwise /usr/ccs/bin/as is confused by below .WORD +___ + +for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; +L\$rounds + nop ; otherwise /usr/ccs/bin/as is confused by below .WORD +___ +for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); } +$code.=<<___; + bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled? + nop + + $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments + $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp + $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num + ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl + + $LD `0*$SZ`($ctx),@X[0] ; load context + $LD `1*$SZ`($ctx),@X[1] + $LD `2*$SZ`($ctx),@X[2] + $LD `3*$SZ`($ctx),@X[3] + $LD `4*$SZ`($ctx),@X[4] + $LD `5*$SZ`($ctx),@X[5] + addl @X[0],$A,$A + $LD `6*$SZ`($ctx),@X[6] + addl @X[1],$B,$B + $LD `7*$SZ`($ctx),@X[7] + ldo `16*$SZ`($inp),$inp ; advance $inp + + $ST $A,`0*$SZ`($ctx) ; save context + addl @X[2],$C,$C + $ST $B,`1*$SZ`($ctx) + addl @X[3],$D,$D + $ST $C,`2*$SZ`($ctx) + addl @X[4],$E,$E + $ST $D,`3*$SZ`($ctx) + addl @X[5],$F,$F + $ST $E,`4*$SZ`($ctx) + addl @X[6],$G,$G + $ST $F,`5*$SZ`($ctx) + addl @X[7],$H,$H + $ST $G,`6*$SZ`($ctx) + $ST $H,`7*$SZ`($ctx) + + cmpb,*<>,n $inp,$num,L\$oop + $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp +___ +if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0 +{{ +$code.=<<___; + b L\$done + nop + + .ALIGN 64 +L\$parisc1 +___ + +@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo, + $Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) = + ( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8", + "%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16"); +$a0 ="%r17"; +$a1 ="%r18"; +$a2 ="%r19"; +$a3 ="%r20"; +$t0 ="%r21"; +$t1 ="%r22"; +$t2 ="%r28"; +$t3 ="%r29"; +$Tbl="%r31"; + +@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx + +sub ROUND_00_15_pa1 { +my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo, + $ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_; +my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; + +$code.=<<___ if (!$flag); + ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi + ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] +___ +$code.=<<___; + shd $ehi,$elo,$Sigma1[0],$t0 + add $Xlo,$hlo,$hlo + shd $elo,$ehi,$Sigma1[0],$t1 + addc $Xhi,$hhi,$hhi ; h += X[i] + shd $ehi,$elo,$Sigma1[1],$t2 + ldwm 8($Tbl),$Xhi + shd $elo,$ehi,$Sigma1[1],$t3 + ldw -4($Tbl),$Xlo ; load K[i] + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 + and $flo,$elo,$a0 + and $fhi,$ehi,$a1 + shd $ehi,$elo,$Sigma1[2],$t2 + andcm $glo,$elo,$a2 + shd $elo,$ehi,$Sigma1[2],$t3 + andcm $ghi,$ehi,$a3 + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 ; Sigma1(e) + add $Xlo,$hlo,$hlo + xor $a2,$a0,$a0 + addc $Xhi,$hhi,$hhi ; h += K[i] + xor $a3,$a1,$a1 ; Ch(e,f,g) + + add $t0,$hlo,$hlo + shd $ahi,$alo,$Sigma0[0],$t0 + addc $t1,$hhi,$hhi ; h += Sigma1(e) + shd $alo,$ahi,$Sigma0[0],$t1 + add $a0,$hlo,$hlo + shd $ahi,$alo,$Sigma0[1],$t2 + addc $a1,$hhi,$hhi ; h += Ch(e,f,g) + shd $alo,$ahi,$Sigma0[1],$t3 + + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 + shd $ahi,$alo,$Sigma0[2],$t2 + and $alo,$blo,$a0 + shd $alo,$ahi,$Sigma0[2],$t3 + and $ahi,$bhi,$a1 + xor $t2,$t0,$t0 + xor $t3,$t1,$t1 ; Sigma0(a) + + and $alo,$clo,$a2 + and $ahi,$chi,$a3 + xor $a2,$a0,$a0 + add $hlo,$dlo,$dlo + xor $a3,$a1,$a1 + addc $hhi,$dhi,$dhi ; d += h + and $blo,$clo,$a2 + add $t0,$hlo,$hlo + and $bhi,$chi,$a3 + addc $t1,$hhi,$hhi ; h += Sigma0(a) + xor $a2,$a0,$a0 + add $a0,$hlo,$hlo + xor $a3,$a1,$a1 ; Maj(a,b,c) + addc $a1,$hhi,$hhi ; h += Maj(a,b,c) + +___ +$code.=<<___ if ($i==15 && $flag); + extru $Xlo,31,10,$Xlo + comiclr,= $LAST10BITS,$Xlo,%r0 + b L\$rounds_pa1 + nop +___ +push(@X,shift(@X)); push(@X,shift(@X)); +} + +sub ROUND_16_xx_pa1 { +my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X; +my ($i)=shift; +$i-=16; +$code.=<<___; + ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi + ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1] + ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1 + ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9] + ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3 + ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14] + shd $Xnhi,$Xnlo,$sigma0[0],$t0 + shd $Xnlo,$Xnhi,$sigma0[0],$t1 + add $a0,$Xlo,$Xlo + shd $Xnhi,$Xnlo,$sigma0[1],$t2 + addc $a1,$Xhi,$Xhi + shd $Xnlo,$Xnhi,$sigma0[1],$t3 + xor $t2,$t0,$t0 + shd $Xnhi,$Xnlo,$sigma0[2],$t2 + xor $t3,$t1,$t1 + extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3 + xor $t2,$t0,$t0 + shd $a3,$a2,$sigma1[0],$a0 + xor $t3,$t1,$t1 ; sigma0(X[i+1)&0x0f]) + shd $a2,$a3,$sigma1[0],$a1 + add $t0,$Xlo,$Xlo + shd $a3,$a2,$sigma1[1],$t2 + addc $t1,$Xhi,$Xhi + shd $a2,$a3,$sigma1[1],$t3 + xor $t2,$a0,$a0 + shd $a3,$a2,$sigma1[2],$t2 + xor $t3,$a1,$a1 + extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3 + xor $t2,$a0,$a0 + xor $t3,$a1,$a1 ; sigma0(X[i+14)&0x0f]) + add $a0,$Xlo,$Xlo + addc $a1,$Xhi,$Xhi + + stw $Xhi,`-$XOFF+8*($i%16)`(%sp) + stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp) +___ +&ROUND_00_15_pa1($i,@_,1); +} +$code.=<<___; + ldw `0*4`($ctx),$Ahi ; load context + ldw `1*4`($ctx),$Alo + ldw `2*4`($ctx),$Bhi + ldw `3*4`($ctx),$Blo + ldw `4*4`($ctx),$Chi + ldw `5*4`($ctx),$Clo + ldw `6*4`($ctx),$Dhi + ldw `7*4`($ctx),$Dlo + ldw `8*4`($ctx),$Ehi + ldw `9*4`($ctx),$Elo + ldw `10*4`($ctx),$Fhi + ldw `11*4`($ctx),$Flo + ldw `12*4`($ctx),$Ghi + ldw `13*4`($ctx),$Glo + ldw `14*4`($ctx),$Hhi + ldw `15*4`($ctx),$Hlo + + extru $inp,31,2,$t0 + sh3addl $t0,%r0,$t0 + subi 32,$t0,$t0 + mtctl $t0,%cr11 ; load %sar with align factor + +L\$oop_pa1 + extru $inp,31,2,$a3 + comib,= 0,$a3,L\$aligned_pa1 + sub $inp,$a3,$inp + + ldw `0*4`($inp),$X[0] + ldw `1*4`($inp),$X[1] + ldw `2*4`($inp),$t2 + ldw `3*4`($inp),$t3 + ldw `4*4`($inp),$a0 + ldw `5*4`($inp),$a1 + ldw `6*4`($inp),$a2 + ldw `7*4`($inp),$a3 + vshd $X[0],$X[1],$X[0] + vshd $X[1],$t2,$X[1] + stw $X[0],`-$XOFF+0*4`(%sp) + ldw `8*4`($inp),$t0 + vshd $t2,$t3,$t2 + stw $X[1],`-$XOFF+1*4`(%sp) + ldw `9*4`($inp),$t1 + vshd $t3,$a0,$t3 +___ +{ +my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); +for ($i=2;$i<=(128/4-8);$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) + ldw `(8+$i)*4`($inp),$t[0] + vshd $t[1],$t[2],$t[1] +___ +push(@t,shift(@t)); +} +for (;$i<(128/4-1);$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) + vshd $t[1],$t[2],$t[1] +___ +push(@t,shift(@t)); +} +$code.=<<___; + b L\$collected_pa1 + stw $t[0],`-$XOFF+$i*4`(%sp) + +___ +} +$code.=<<___; +L\$aligned_pa1 + ldw `0*4`($inp),$X[0] + ldw `1*4`($inp),$X[1] + ldw `2*4`($inp),$t2 + ldw `3*4`($inp),$t3 + ldw `4*4`($inp),$a0 + ldw `5*4`($inp),$a1 + ldw `6*4`($inp),$a2 + ldw `7*4`($inp),$a3 + stw $X[0],`-$XOFF+0*4`(%sp) + ldw `8*4`($inp),$t0 + stw $X[1],`-$XOFF+1*4`(%sp) + ldw `9*4`($inp),$t1 +___ +{ +my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1); +for ($i=2;$i<(128/4-8);$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) + ldw `(8+$i)*4`($inp),$t[0] +___ +push(@t,shift(@t)); +} +for (;$i<128/4;$i++) { +$code.=<<___; + stw $t[0],`-$XOFF+$i*4`(%sp) +___ +push(@t,shift(@t)); +} +$code.="L\$collected_pa1\n"; +} + +for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } +$code.="L\$rounds_pa1\n"; +for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); } + +$code.=<<___; + $POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments + $POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp + $POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num + ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl + + ldw `0*4`($ctx),$t1 ; update context + ldw `1*4`($ctx),$t0 + ldw `2*4`($ctx),$t3 + ldw `3*4`($ctx),$t2 + ldw `4*4`($ctx),$a1 + ldw `5*4`($ctx),$a0 + ldw `6*4`($ctx),$a3 + add $t0,$Alo,$Alo + ldw `7*4`($ctx),$a2 + addc $t1,$Ahi,$Ahi + ldw `8*4`($ctx),$t1 + add $t2,$Blo,$Blo + ldw `9*4`($ctx),$t0 + addc $t3,$Bhi,$Bhi + ldw `10*4`($ctx),$t3 + add $a0,$Clo,$Clo + ldw `11*4`($ctx),$t2 + addc $a1,$Chi,$Chi + ldw `12*4`($ctx),$a1 + add $a2,$Dlo,$Dlo + ldw `13*4`($ctx),$a0 + addc $a3,$Dhi,$Dhi + ldw `14*4`($ctx),$a3 + add $t0,$Elo,$Elo + ldw `15*4`($ctx),$a2 + addc $t1,$Ehi,$Ehi + stw $Ahi,`0*4`($ctx) + add $t2,$Flo,$Flo + stw $Alo,`1*4`($ctx) + addc $t3,$Fhi,$Fhi + stw $Bhi,`2*4`($ctx) + add $a0,$Glo,$Glo + stw $Blo,`3*4`($ctx) + addc $a1,$Ghi,$Ghi + stw $Chi,`4*4`($ctx) + add $a2,$Hlo,$Hlo + stw $Clo,`5*4`($ctx) + addc $a3,$Hhi,$Hhi + stw $Dhi,`6*4`($ctx) + ldo `16*$SZ`($inp),$inp ; advance $inp + stw $Dlo,`7*4`($ctx) + stw $Ehi,`8*4`($ctx) + stw $Elo,`9*4`($ctx) + stw $Fhi,`10*4`($ctx) + stw $Flo,`11*4`($ctx) + stw $Ghi,`12*4`($ctx) + stw $Glo,`13*4`($ctx) + stw $Hhi,`14*4`($ctx) + comb,= $inp,$num,L\$done + stw $Hlo,`15*4`($ctx) + b L\$oop_pa1 + $PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp +L\$done +___ +}} +$code.=<<___; + $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue + $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 + $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 + $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 + $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 + $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 + $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 + $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 + $POP `-$FRAME+8*$SIZE_T`(%sp),%r11 + $POP `-$FRAME+9*$SIZE_T`(%sp),%r12 + $POP `-$FRAME+10*$SIZE_T`(%sp),%r13 + $POP `-$FRAME+11*$SIZE_T`(%sp),%r14 + $POP `-$FRAME+12*$SIZE_T`(%sp),%r15 + $POP `-$FRAME+13*$SIZE_T`(%sp),%r16 + $POP `-$FRAME+14*$SIZE_T`(%sp),%r17 + $POP `-$FRAME+15*$SIZE_T`(%sp),%r18 + bv (%r2) + .EXIT + $POPMB -$FRAME(%sp),%r3 + .PROCEND + .STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by " +___ + +# Explicitly encode PA-RISC 2.0 instructions used in this module, so +# that it can be compiled with .LEVEL 1.0. It should be noted that I +# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 +# directive... + +my $ldd = sub { + my ($mod,$args) = @_; + my $orig = "ldd$mod\t$args"; + + if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 3 suffices + { my $opcode=(0x14<<26)|($2<<21)|($3<<16)|(($1&0x1FF8)<<1)|(($1>>13)&1); + $opcode|=(1<<3) if ($mod =~ /^,m/); + $opcode|=(1<<2) if ($mod =~ /^,mb/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $std = sub { + my ($mod,$args) = @_; + my $orig = "std$mod\t$args"; + + if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 3 suffices + { my $opcode=(0x1c<<26)|($3<<21)|($1<<16)|(($2&0x1FF8)<<1)|(($2>>13)&1); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $extrd = sub { + my ($mod,$args) = @_; + my $orig = "extrd$mod\t$args"; + + # I only have ",u" completer, it's implicitly encoded... + if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 + { my $opcode=(0x36<<26)|($1<<21)|($4<<16); + my $len=32-$3; + $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos + $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 + { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); + my $len=32-$2; + $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len + $opcode |= (1<<13) if ($mod =~ /,\**=/); + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + else { "\t".$orig; } +}; + +my $shrpd = sub { + my ($mod,$args) = @_; + my $orig = "shrpd$mod\t$args"; + + if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 + { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; + my $cpos=63-$3; + $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa + sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; + } + elsif ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) # format 11 + { sprintf "\t.WORD\t0x%08x\t; %s", + (0x34<<26)|($2<<21)|($1<<16)|(1<<9)|$3,$orig; + } + else { "\t".$orig; } +}; + +sub assemble { + my ($mnemonic,$mod,$args)=@_; + my $opcode = eval("\$$mnemonic"); + + ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/ + $3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32 + : sprintf("shd\t%$1,%$2,%d",$3)/e or + # translate made up instructons: _ror, _shr, _align, _shl + s/_ror(\s+)(%r[0-9]+),/ + ($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or + + s/_shr(\s+%r[0-9]+),([0-9]+),/ + $SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2) + : sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or + + s/_align(\s+%r[0-9]+,%r[0-9]+),/ + ($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or + + s/_shl(\s+%r[0-9]+),([0-9]+),/ + $SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2) + : sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e; + + s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4); + + s/cmpb,\*/comb,/ if ($SIZE_T==4); + + print $_,"\n"; +} + +close STDOUT; diff --git a/src/lib/libcrypto/sha/asm/sha512-ppc.pl b/src/lib/libcrypto/sha/asm/sha512-ppc.pl index 768a6a6fad..6b44a68e59 100755 --- a/src/lib/libcrypto/sha/asm/sha512-ppc.pl +++ b/src/lib/libcrypto/sha/asm/sha512-ppc.pl @@ -40,6 +40,7 @@ $output =shift; if ($flavour =~ /64/) { $SIZE_T=8; + $LRSAVE=2*$SIZE_T; $STU="stdu"; $UCMP="cmpld"; $SHL="sldi"; @@ -47,6 +48,7 @@ if ($flavour =~ /64/) { $PUSH="std"; } elsif ($flavour =~ /32/) { $SIZE_T=4; + $LRSAVE=$SIZE_T; $STU="stwu"; $UCMP="cmplw"; $SHL="slwi"; @@ -87,7 +89,8 @@ if ($output =~ /512/) { $SHR="srwi"; } -$FRAME=32*$SIZE_T; +$FRAME=32*$SIZE_T+16*$SZ; +$LOCALS=6*$SIZE_T; $sp ="r1"; $toc="r2"; @@ -179,13 +182,12 @@ $code=<<___; .globl $func .align 6 $func: + $STU $sp,-$FRAME($sp) mflr r0 - $STU $sp,`-($FRAME+16*$SZ)`($sp) $SHL $num,$num,`log(16*$SZ)/log(2)` $PUSH $ctx,`$FRAME-$SIZE_T*22`($sp) - $PUSH r0,`$FRAME-$SIZE_T*21`($sp) $PUSH $toc,`$FRAME-$SIZE_T*20`($sp) $PUSH r13,`$FRAME-$SIZE_T*19`($sp) $PUSH r14,`$FRAME-$SIZE_T*18`($sp) @@ -206,6 +208,7 @@ $func: $PUSH r29,`$FRAME-$SIZE_T*3`($sp) $PUSH r30,`$FRAME-$SIZE_T*2`($sp) $PUSH r31,`$FRAME-$SIZE_T*1`($sp) + $PUSH r0,`$FRAME+$LRSAVE`($sp) $LD $A,`0*$SZ`($ctx) mr $inp,r4 ; incarnate $inp @@ -217,7 +220,7 @@ $func: $LD $G,`6*$SZ`($ctx) $LD $H,`7*$SZ`($ctx) - b LPICmeup + bl LPICmeup LPICedup: andi. r0,$inp,3 bne Lunaligned @@ -226,40 +229,14 @@ Laligned: $PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer bl Lsha2_block_private -Ldone: - $POP r0,`$FRAME-$SIZE_T*21`($sp) - $POP $toc,`$FRAME-$SIZE_T*20`($sp) - $POP r13,`$FRAME-$SIZE_T*19`($sp) - $POP r14,`$FRAME-$SIZE_T*18`($sp) - $POP r15,`$FRAME-$SIZE_T*17`($sp) - $POP r16,`$FRAME-$SIZE_T*16`($sp) - $POP r17,`$FRAME-$SIZE_T*15`($sp) - $POP r18,`$FRAME-$SIZE_T*14`($sp) - $POP r19,`$FRAME-$SIZE_T*13`($sp) - $POP r20,`$FRAME-$SIZE_T*12`($sp) - $POP r21,`$FRAME-$SIZE_T*11`($sp) - $POP r22,`$FRAME-$SIZE_T*10`($sp) - $POP r23,`$FRAME-$SIZE_T*9`($sp) - $POP r24,`$FRAME-$SIZE_T*8`($sp) - $POP r25,`$FRAME-$SIZE_T*7`($sp) - $POP r26,`$FRAME-$SIZE_T*6`($sp) - $POP r27,`$FRAME-$SIZE_T*5`($sp) - $POP r28,`$FRAME-$SIZE_T*4`($sp) - $POP r29,`$FRAME-$SIZE_T*3`($sp) - $POP r30,`$FRAME-$SIZE_T*2`($sp) - $POP r31,`$FRAME-$SIZE_T*1`($sp) - mtlr r0 - addi $sp,$sp,`$FRAME+16*$SZ` - blr -___ + b Ldone -# PowerPC specification allows an implementation to be ill-behaved -# upon unaligned access which crosses page boundary. "Better safe -# than sorry" principle makes me treat it specially. But I don't -# look for particular offending word, but rather for the input -# block which crosses the boundary. Once found that block is aligned -# and hashed separately... -$code.=<<___; +; PowerPC specification allows an implementation to be ill-behaved +; upon unaligned access which crosses page boundary. "Better safe +; than sorry" principle makes me treat it specially. But I don't +; look for particular offending word, but rather for the input +; block which crosses the boundary. Once found that block is aligned +; and hashed separately... .align 4 Lunaligned: subfic $t1,$inp,4096 @@ -278,7 +255,7 @@ Lunaligned: Lcross_page: li $t1,`16*$SZ/4` mtctr $t1 - addi r20,$sp,$FRAME ; aligned spot below the frame + addi r20,$sp,$LOCALS ; aligned spot below the frame Lmemcpy: lbz r16,0($inp) lbz r17,1($inp) @@ -293,8 +270,8 @@ Lmemcpy: bdnz Lmemcpy $PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp - addi $t1,$sp,`$FRAME+16*$SZ` ; fictitious end pointer - addi $inp,$sp,$FRAME ; fictitious inp pointer + addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer + addi $inp,$sp,$LOCALS ; fictitious inp pointer $PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num $PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer $PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer @@ -303,10 +280,36 @@ Lmemcpy: $POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num addic. $num,$num,`-16*$SZ` ; num-- bne- Lunaligned - b Ldone -___ -$code.=<<___; +Ldone: + $POP r0,`$FRAME+$LRSAVE`($sp) + $POP $toc,`$FRAME-$SIZE_T*20`($sp) + $POP r13,`$FRAME-$SIZE_T*19`($sp) + $POP r14,`$FRAME-$SIZE_T*18`($sp) + $POP r15,`$FRAME-$SIZE_T*17`($sp) + $POP r16,`$FRAME-$SIZE_T*16`($sp) + $POP r17,`$FRAME-$SIZE_T*15`($sp) + $POP r18,`$FRAME-$SIZE_T*14`($sp) + $POP r19,`$FRAME-$SIZE_T*13`($sp) + $POP r20,`$FRAME-$SIZE_T*12`($sp) + $POP r21,`$FRAME-$SIZE_T*11`($sp) + $POP r22,`$FRAME-$SIZE_T*10`($sp) + $POP r23,`$FRAME-$SIZE_T*9`($sp) + $POP r24,`$FRAME-$SIZE_T*8`($sp) + $POP r25,`$FRAME-$SIZE_T*7`($sp) + $POP r26,`$FRAME-$SIZE_T*6`($sp) + $POP r27,`$FRAME-$SIZE_T*5`($sp) + $POP r28,`$FRAME-$SIZE_T*4`($sp) + $POP r29,`$FRAME-$SIZE_T*3`($sp) + $POP r30,`$FRAME-$SIZE_T*2`($sp) + $POP r31,`$FRAME-$SIZE_T*1`($sp) + mtlr r0 + addi $sp,$sp,$FRAME + blr + .long 0 + .byte 0,12,4,1,0x80,18,3,0 + .long 0 + .align 4 Lsha2_block_private: ___ @@ -372,6 +375,8 @@ $code.=<<___; $ST $H,`7*$SZ`($ctx) bne Lsha2_block_private blr + .long 0 + .byte 0,12,0x14,0,0,0,0,0 ___ # Ugly hack here, because PPC assembler syntax seem to vary too @@ -379,22 +384,15 @@ ___ $code.=<<___; .align 6 LPICmeup: - bl LPIC - addi $Tbl,$Tbl,`64-4` ; "distance" between . and last nop - b LPICedup - nop - nop - nop - nop - nop -LPIC: mflr $Tbl + mflr r0 + bcl 20,31,\$+4 + mflr $Tbl ; vvvvvv "distance" between . and 1st data entry + addi $Tbl,$Tbl,`64-8` + mtlr r0 blr - nop - nop - nop - nop - nop - nop + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + .space `64-9*4` ___ $code.=<<___ if ($SZ==8); .long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd diff --git a/src/lib/libcrypto/sha/asm/sha512-s390x.pl b/src/lib/libcrypto/sha/asm/sha512-s390x.pl index e7ef2d5a9f..079a3fc78a 100644 --- a/src/lib/libcrypto/sha/asm/sha512-s390x.pl +++ b/src/lib/libcrypto/sha/asm/sha512-s390x.pl @@ -26,6 +26,26 @@ # favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster # than software. +# November 2010. +# +# Adapt for -m31 build. If kernel supports what's called "highgprs" +# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit +# instructions and achieve "64-bit" performance even in 31-bit legacy +# application context. The feature is not specific to any particular +# processor, as long as it's "z-CPU". Latter implies that the code +# remains z/Architecture specific. On z900 SHA256 was measured to +# perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3. + +$flavour = shift; + +if ($flavour =~ /3[12]/) { + $SIZE_T=4; + $g=""; +} else { + $SIZE_T=8; + $g="g"; +} + $t0="%r0"; $t1="%r1"; $ctx="%r2"; $t2="%r2"; @@ -44,7 +64,7 @@ $tbl="%r13"; $T1="%r14"; $sp="%r15"; -$output=shift; +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} open STDOUT,">$output"; if ($output =~ /512/) { @@ -78,7 +98,8 @@ if ($output =~ /512/) { } $Func="sha${label}_block_data_order"; $Table="K${label}"; -$frame=160+16*$SZ; +$stdframe=16*$SIZE_T+4*8; +$frame=$stdframe+16*$SZ; sub BODY_00_15 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; @@ -93,9 +114,9 @@ $code.=<<___; xgr $t0,$t1 $ROT $t1,$t1,`$Sigma1[2]-$Sigma1[1]` xgr $t2,$g - $ST $T1,`160+$SZ*($i%16)`($sp) + $ST $T1,`$stdframe+$SZ*($i%16)`($sp) xgr $t0,$t1 # Sigma1(e) - la $T1,0($T1,$h) # T1+=h + algr $T1,$h # T1+=h ngr $t2,$e lgr $t1,$a algr $T1,$t0 # T1+=Sigma1(e) @@ -113,7 +134,7 @@ $code.=<<___; ngr $t2,$b algr $h,$T1 # h+=T1 ogr $t2,$t1 # Maj(a,b,c) - la $d,0($d,$T1) # d+=T1 + algr $d,$T1 # d+=T1 algr $h,$t2 # h+=Maj(a,b,c) ___ } @@ -122,19 +143,19 @@ sub BODY_16_XX { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; - $LD $T1,`160+$SZ*(($i+1)%16)`($sp) ### $i - $LD $t1,`160+$SZ*(($i+14)%16)`($sp) + $LD $T1,`$stdframe+$SZ*(($i+1)%16)`($sp) ### $i + $LD $t1,`$stdframe+$SZ*(($i+14)%16)`($sp) $ROT $t0,$T1,$sigma0[0] $SHR $T1,$sigma0[2] $ROT $t2,$t0,`$sigma0[1]-$sigma0[0]` xgr $T1,$t0 $ROT $t0,$t1,$sigma1[0] - xgr $T1,$t2 # sigma0(X[i+1]) + xgr $T1,$t2 # sigma0(X[i+1]) $SHR $t1,$sigma1[2] - $ADD $T1,`160+$SZ*($i%16)`($sp) # +=X[i] + $ADD $T1,`$stdframe+$SZ*($i%16)`($sp) # +=X[i] xgr $t1,$t0 $ROT $t0,$t0,`$sigma1[1]-$sigma1[0]` - $ADD $T1,`160+$SZ*(($i+9)%16)`($sp) # +=X[i+9] + $ADD $T1,`$stdframe+$SZ*(($i+9)%16)`($sp) # +=X[i+9] xgr $t1,$t0 # sigma1(X[i+14]) algr $T1,$t1 # +=sigma1(X[i+14]) ___ @@ -212,6 +233,7 @@ $code.=<<___; .globl $Func .type $Func,\@function $Func: + sllg $len,$len,`log(16*$SZ)/log(2)` ___ $code.=<<___ if ($kimdfunc); larl %r1,OPENSSL_s390xcap_P @@ -219,15 +241,15 @@ $code.=<<___ if ($kimdfunc); tmhl %r0,0x4000 # check for message-security assist jz .Lsoftware lghi %r0,0 - la %r1,16($sp) + la %r1,`2*$SIZE_T`($sp) .long 0xb93e0002 # kimd %r0,%r2 - lg %r0,16($sp) + lg %r0,`2*$SIZE_T`($sp) tmhh %r0,`0x8000>>$kimdfunc` jz .Lsoftware lghi %r0,$kimdfunc lgr %r1,$ctx lgr %r2,$inp - sllg %r3,$len,`log(16*$SZ)/log(2)` + lgr %r3,$len .long 0xb93e0002 # kimd %r0,%r2 brc 1,.-4 # pay attention to "partial completion" br %r14 @@ -235,13 +257,12 @@ $code.=<<___ if ($kimdfunc); .Lsoftware: ___ $code.=<<___; - sllg $len,$len,`log(16*$SZ)/log(2)` lghi %r1,-$frame - agr $len,$inp - stmg $ctx,%r15,16($sp) + la $len,0($len,$inp) + stm${g} $ctx,%r15,`2*$SIZE_T`($sp) lgr %r0,$sp la $sp,0(%r1,$sp) - stg %r0,0($sp) + st${g} %r0,0($sp) larl $tbl,$Table $LD $A,`0*$SZ`($ctx) @@ -265,7 +286,7 @@ $code.=<<___; clgr $len,$t0 jne .Lrounds_16_xx - lg $ctx,`$frame+16`($sp) + l${g} $ctx,`$frame+2*$SIZE_T`($sp) la $inp,`16*$SZ`($inp) $ADD $A,`0*$SZ`($ctx) $ADD $B,`1*$SZ`($ctx) @@ -283,14 +304,14 @@ $code.=<<___; $ST $F,`5*$SZ`($ctx) $ST $G,`6*$SZ`($ctx) $ST $H,`7*$SZ`($ctx) - clg $inp,`$frame+32`($sp) + cl${g} $inp,`$frame+4*$SIZE_T`($sp) jne .Lloop - lmg %r6,%r15,`$frame+48`($sp) + lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp) br %r14 .size $Func,.-$Func .string "SHA${label} block transform for s390x, CRYPTOGAMS by " -.comm OPENSSL_s390xcap_P,8,8 +.comm OPENSSL_s390xcap_P,16,8 ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl index ec5d78135e..585740789e 100644 --- a/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl +++ b/src/lib/libcrypto/sha/asm/sha512-sparcv9.pl @@ -305,9 +305,9 @@ $code.=<<___; srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9] xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) srl @X[($i/2)%8],0,$tmp0 + add $tmp2,$tmp1,$tmp1 add $xi,$T1,$T1 ! +=X[i] xor $tmp0,@X[($i/2)%8],@X[($i/2)%8] - add $tmp2,$T1,$T1 add $tmp1,$T1,$T1 srl $T1,0,$T1 @@ -318,9 +318,9 @@ ___ $code.=<<___; srlx @X[($i/2)%8],32,$tmp1 ! X[i] xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14]) - srl @X[($i/2)%8],0,@X[($i/2)%8] add $xi,$T1,$T1 ! +=X[i+9] - add $tmp2,$T1,$T1 + add $tmp2,$tmp1,$tmp1 + srl @X[($i/2)%8],0,@X[($i/2)%8] add $tmp1,$T1,$T1 sllx $T1,32,$tmp0 diff --git a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl index e6643f8cf6..f611a2d898 100755 --- a/src/lib/libcrypto/sha/asm/sha512-x86_64.pl +++ b/src/lib/libcrypto/sha/asm/sha512-x86_64.pl @@ -95,50 +95,44 @@ sub ROUND_00_15() { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; - mov $e,$a0 - mov $e,$a1 + ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 mov $f,$a2 + mov $T1,`$SZ*($i&0xf)`(%rsp) - ror \$$Sigma1[0],$a0 - ror \$$Sigma1[1],$a1 + ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 + xor $e,$a0 xor $g,$a2 # f^g - xor $a1,$a0 - ror \$`$Sigma1[2]-$Sigma1[1]`,$a1 + ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 + add $h,$T1 # T1+=h + xor $a,$a1 + + add ($Tbl,$round,$SZ),$T1 # T1+=K[round] and $e,$a2 # (f^g)&e - mov $T1,`$SZ*($i&0xf)`(%rsp) + mov $b,$h - xor $a1,$a0 # Sigma1(e) + ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 + xor $e,$a0 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g - add $h,$T1 # T1+=h - - mov $a,$h - add $a0,$T1 # T1+=Sigma1(e) + xor $c,$h # b^c + xor $a,$a1 add $a2,$T1 # T1+=Ch(e,f,g) - mov $a,$a0 - mov $a,$a1 + mov $b,$a2 - ror \$$Sigma0[0],$h - ror \$$Sigma0[1],$a0 - mov $a,$a2 - add ($Tbl,$round,$SZ),$T1 # T1+=K[round] + ror \$$Sigma1[0],$a0 # Sigma1(e) + and $a,$h # h=(b^c)&a + and $c,$a2 # b&c - xor $a0,$h - ror \$`$Sigma0[2]-$Sigma0[1]`,$a0 - or $c,$a1 # a|c + ror \$$Sigma0[0],$a1 # Sigma0(a) + add $a0,$T1 # T1+=Sigma1(e) + add $a2,$h # h+=b&c (completes +=Maj(a,b,c) - xor $a0,$h # h=Sigma0(a) - and $c,$a2 # a&c add $T1,$d # d+=T1 - - and $b,$a1 # (a|c)&b add $T1,$h # h+=T1 - - or $a2,$a1 # Maj(a,b,c)=((a|c)&b)|(a&c) lea 1($round),$round # round++ + add $a1,$h # h+=Sigma0(a) - add $a1,$h # h+=Maj(a,b,c) ___ } @@ -147,32 +141,30 @@ sub ROUND_16_XX() $code.=<<___; mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 - mov `$SZ*(($i+14)&0xf)`(%rsp),$T1 - - mov $a0,$a2 + mov `$SZ*(($i+14)&0xf)`(%rsp),$a1 + mov $a0,$T1 + mov $a1,$a2 + ror \$`$sigma0[1]-$sigma0[0]`,$T1 + xor $a0,$T1 shr \$$sigma0[2],$a0 - ror \$$sigma0[0],$a2 - - xor $a2,$a0 - ror \$`$sigma0[1]-$sigma0[0]`,$a2 - xor $a2,$a0 # sigma0(X[(i+1)&0xf]) - mov $T1,$a1 + ror \$$sigma0[0],$T1 + xor $T1,$a0 # sigma0(X[(i+1)&0xf]) + mov `$SZ*(($i+9)&0xf)`(%rsp),$T1 - shr \$$sigma1[2],$T1 - ror \$$sigma1[0],$a1 - - xor $a1,$T1 - ror \$`$sigma1[1]-$sigma1[0]`,$a1 - - xor $a1,$T1 # sigma1(X[(i+14)&0xf]) + ror \$`$sigma1[1]-$sigma1[0]`,$a2 + xor $a1,$a2 + shr \$$sigma1[2],$a1 + ror \$$sigma1[0],$a2 add $a0,$T1 - - add `$SZ*(($i+9)&0xf)`(%rsp),$T1 + xor $a2,$a1 # sigma1(X[(i+14)&0xf]) add `$SZ*($i&0xf)`(%rsp),$T1 + mov $e,$a0 + add $a1,$T1 + mov $a,$a1 ___ &ROUND_00_15(@_); } @@ -219,6 +211,8 @@ $func: ___ for($i=0;$i<16;$i++) { $code.=" mov $SZ*$i($inp),$T1\n"; + $code.=" mov @ROT[4],$a0\n"; + $code.=" mov @ROT[0],$a1\n"; $code.=" bswap $T1\n"; &ROUND_00_15($i,@ROT); unshift(@ROT,pop(@ROT)); diff --git a/src/lib/libcrypto/sha/sha.h b/src/lib/libcrypto/sha/sha.h index 16cacf9fc0..8a6bf4bbbb 100644 --- a/src/lib/libcrypto/sha/sha.h +++ b/src/lib/libcrypto/sha/sha.h @@ -106,6 +106,9 @@ typedef struct SHAstate_st } SHA_CTX; #ifndef OPENSSL_NO_SHA0 +#ifdef OPENSSL_FIPS +int private_SHA_Init(SHA_CTX *c); +#endif int SHA_Init(SHA_CTX *c); int SHA_Update(SHA_CTX *c, const void *data, size_t len); int SHA_Final(unsigned char *md, SHA_CTX *c); @@ -113,6 +116,9 @@ unsigned char *SHA(const unsigned char *d, size_t n, unsigned char *md); void SHA_Transform(SHA_CTX *c, const unsigned char *data); #endif #ifndef OPENSSL_NO_SHA1 +#ifdef OPENSSL_FIPS +int private_SHA1_Init(SHA_CTX *c); +#endif int SHA1_Init(SHA_CTX *c); int SHA1_Update(SHA_CTX *c, const void *data, size_t len); int SHA1_Final(unsigned char *md, SHA_CTX *c); @@ -135,6 +141,10 @@ typedef struct SHA256state_st } SHA256_CTX; #ifndef OPENSSL_NO_SHA256 +#ifdef OPENSSL_FIPS +int private_SHA224_Init(SHA256_CTX *c); +int private_SHA256_Init(SHA256_CTX *c); +#endif int SHA224_Init(SHA256_CTX *c); int SHA224_Update(SHA256_CTX *c, const void *data, size_t len); int SHA224_Final(unsigned char *md, SHA256_CTX *c); @@ -182,6 +192,10 @@ typedef struct SHA512state_st #endif #ifndef OPENSSL_NO_SHA512 +#ifdef OPENSSL_FIPS +int private_SHA384_Init(SHA512_CTX *c); +int private_SHA512_Init(SHA512_CTX *c); +#endif int SHA384_Init(SHA512_CTX *c); int SHA384_Update(SHA512_CTX *c, const void *data, size_t len); int SHA384_Final(unsigned char *md, SHA512_CTX *c); diff --git a/src/lib/libcrypto/sha/sha1dgst.c b/src/lib/libcrypto/sha/sha1dgst.c index 50d1925cde..81219af088 100644 --- a/src/lib/libcrypto/sha/sha1dgst.c +++ b/src/lib/libcrypto/sha/sha1dgst.c @@ -57,6 +57,7 @@ */ #include +#include #if !defined(OPENSSL_NO_SHA1) && !defined(OPENSSL_NO_SHA) #undef SHA_0 diff --git a/src/lib/libcrypto/sha/sha256.c b/src/lib/libcrypto/sha/sha256.c index 8952d87673..f88d3d6dad 100644 --- a/src/lib/libcrypto/sha/sha256.c +++ b/src/lib/libcrypto/sha/sha256.c @@ -16,7 +16,7 @@ const char SHA256_version[]="SHA-256" OPENSSL_VERSION_PTEXT; -int SHA224_Init (SHA256_CTX *c) +fips_md_init_ctx(SHA224, SHA256) { memset (c,0,sizeof(*c)); c->h[0]=0xc1059ed8UL; c->h[1]=0x367cd507UL; @@ -27,7 +27,7 @@ int SHA224_Init (SHA256_CTX *c) return 1; } -int SHA256_Init (SHA256_CTX *c) +fips_md_init(SHA256) { memset (c,0,sizeof(*c)); c->h[0]=0x6a09e667UL; c->h[1]=0xbb67ae85UL; diff --git a/src/lib/libcrypto/sha/sha512.c b/src/lib/libcrypto/sha/sha512.c index cbc0e58c48..50dd7dc744 100644 --- a/src/lib/libcrypto/sha/sha512.c +++ b/src/lib/libcrypto/sha/sha512.c @@ -59,21 +59,8 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT; #define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA #endif -int SHA384_Init (SHA512_CTX *c) +fips_md_init_ctx(SHA384, SHA512) { -#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm)) - /* maintain dword order required by assembler module */ - unsigned int *h = (unsigned int *)c->h; - - h[0] = 0xcbbb9d5d; h[1] = 0xc1059ed8; - h[2] = 0x629a292a; h[3] = 0x367cd507; - h[4] = 0x9159015a; h[5] = 0x3070dd17; - h[6] = 0x152fecd8; h[7] = 0xf70e5939; - h[8] = 0x67332667; h[9] = 0xffc00b31; - h[10] = 0x8eb44a87; h[11] = 0x68581511; - h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7; - h[14] = 0x47b5481d; h[15] = 0xbefa4fa4; -#else c->h[0]=U64(0xcbbb9d5dc1059ed8); c->h[1]=U64(0x629a292a367cd507); c->h[2]=U64(0x9159015a3070dd17); @@ -82,27 +69,14 @@ int SHA384_Init (SHA512_CTX *c) c->h[5]=U64(0x8eb44a8768581511); c->h[6]=U64(0xdb0c2e0d64f98fa7); c->h[7]=U64(0x47b5481dbefa4fa4); -#endif + c->Nl=0; c->Nh=0; c->num=0; c->md_len=SHA384_DIGEST_LENGTH; return 1; } -int SHA512_Init (SHA512_CTX *c) +fips_md_init(SHA512) { -#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm)) - /* maintain dword order required by assembler module */ - unsigned int *h = (unsigned int *)c->h; - - h[0] = 0x6a09e667; h[1] = 0xf3bcc908; - h[2] = 0xbb67ae85; h[3] = 0x84caa73b; - h[4] = 0x3c6ef372; h[5] = 0xfe94f82b; - h[6] = 0xa54ff53a; h[7] = 0x5f1d36f1; - h[8] = 0x510e527f; h[9] = 0xade682d1; - h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f; - h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b; - h[14] = 0x5be0cd19; h[15] = 0x137e2179; -#else c->h[0]=U64(0x6a09e667f3bcc908); c->h[1]=U64(0xbb67ae8584caa73b); c->h[2]=U64(0x3c6ef372fe94f82b); @@ -111,7 +85,7 @@ int SHA512_Init (SHA512_CTX *c) c->h[5]=U64(0x9b05688c2b3e6c1f); c->h[6]=U64(0x1f83d9abfb41bd6b); c->h[7]=U64(0x5be0cd19137e2179); -#endif + c->Nl=0; c->Nh=0; c->num=0; c->md_len=SHA512_DIGEST_LENGTH; return 1; @@ -160,24 +134,6 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c) if (md==0) return 0; -#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm)) - /* recall assembler dword order... */ - n = c->md_len; - if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH) - { - unsigned int *h = (unsigned int *)c->h, t; - - for (n/=4;n;n--) - { - t = *(h++); - *(md++) = (unsigned char)(t>>24); - *(md++) = (unsigned char)(t>>16); - *(md++) = (unsigned char)(t>>8); - *(md++) = (unsigned char)(t); - } - } - else return 0; -#else switch (c->md_len) { /* Let compiler decide if it's appropriate to unroll... */ @@ -214,7 +170,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c) /* ... as well as make sure md_len is not abused. */ default: return 0; } -#endif + return 1; } diff --git a/src/lib/libcrypto/sha/sha_locl.h b/src/lib/libcrypto/sha/sha_locl.h index 672c26eee1..7a0c3ca8d8 100644 --- a/src/lib/libcrypto/sha/sha_locl.h +++ b/src/lib/libcrypto/sha/sha_locl.h @@ -122,7 +122,11 @@ void sha1_block_data_order (SHA_CTX *c, const void *p,size_t num); #define INIT_DATA_h3 0x10325476UL #define INIT_DATA_h4 0xc3d2e1f0UL -int HASH_INIT (SHA_CTX *c) +#ifdef SHA_0 +fips_md_init(SHA) +#else +fips_md_init_ctx(SHA1, SHA) +#endif { memset (c,0,sizeof(*c)); c->h0=INIT_DATA_h0; diff --git a/src/lib/libcrypto/sparcv9cap.c b/src/lib/libcrypto/sparcv9cap.c index ed195ab402..43b3ac6f81 100644 --- a/src/lib/libcrypto/sparcv9cap.c +++ b/src/lib/libcrypto/sparcv9cap.c @@ -19,7 +19,8 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U int bn_mul_mont_fpu(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0, int num); - if ((OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) == + if (num>=8 && !(num&1) && + (OPENSSL_sparcv9cap_P&(SPARCV9_PREFER_FPU|SPARCV9_VIS1)) == (SPARCV9_PREFER_FPU|SPARCV9_VIS1)) return bn_mul_mont_fpu(rp,ap,bp,np,n0,num); else @@ -169,7 +170,6 @@ void OPENSSL_cpuid_setup(void) char *e; struct sigaction common_act,ill_oact,bus_oact; sigset_t all_masked,oset; - int sig; static int trigger=0; if (trigger) return; diff --git a/src/lib/libcrypto/stack/safestack.h b/src/lib/libcrypto/stack/safestack.h index 3e76aa58f5..ea3aa0d800 100644 --- a/src/lib/libcrypto/stack/safestack.h +++ b/src/lib/libcrypto/stack/safestack.h @@ -1459,6 +1459,94 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void) #define sk_POLICY_MAPPING_sort(st) SKM_sk_sort(POLICY_MAPPING, (st)) #define sk_POLICY_MAPPING_is_sorted(st) SKM_sk_is_sorted(POLICY_MAPPING, (st)) +#define sk_SRP_gN_new(cmp) SKM_sk_new(SRP_gN, (cmp)) +#define sk_SRP_gN_new_null() SKM_sk_new_null(SRP_gN) +#define sk_SRP_gN_free(st) SKM_sk_free(SRP_gN, (st)) +#define sk_SRP_gN_num(st) SKM_sk_num(SRP_gN, (st)) +#define sk_SRP_gN_value(st, i) SKM_sk_value(SRP_gN, (st), (i)) +#define sk_SRP_gN_set(st, i, val) SKM_sk_set(SRP_gN, (st), (i), (val)) +#define sk_SRP_gN_zero(st) SKM_sk_zero(SRP_gN, (st)) +#define sk_SRP_gN_push(st, val) SKM_sk_push(SRP_gN, (st), (val)) +#define sk_SRP_gN_unshift(st, val) SKM_sk_unshift(SRP_gN, (st), (val)) +#define sk_SRP_gN_find(st, val) SKM_sk_find(SRP_gN, (st), (val)) +#define sk_SRP_gN_find_ex(st, val) SKM_sk_find_ex(SRP_gN, (st), (val)) +#define sk_SRP_gN_delete(st, i) SKM_sk_delete(SRP_gN, (st), (i)) +#define sk_SRP_gN_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_gN, (st), (ptr)) +#define sk_SRP_gN_insert(st, val, i) SKM_sk_insert(SRP_gN, (st), (val), (i)) +#define sk_SRP_gN_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN, (st), (cmp)) +#define sk_SRP_gN_dup(st) SKM_sk_dup(SRP_gN, st) +#define sk_SRP_gN_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN, (st), (free_func)) +#define sk_SRP_gN_shift(st) SKM_sk_shift(SRP_gN, (st)) +#define sk_SRP_gN_pop(st) SKM_sk_pop(SRP_gN, (st)) +#define sk_SRP_gN_sort(st) SKM_sk_sort(SRP_gN, (st)) +#define sk_SRP_gN_is_sorted(st) SKM_sk_is_sorted(SRP_gN, (st)) + +#define sk_SRP_gN_cache_new(cmp) SKM_sk_new(SRP_gN_cache, (cmp)) +#define sk_SRP_gN_cache_new_null() SKM_sk_new_null(SRP_gN_cache) +#define sk_SRP_gN_cache_free(st) SKM_sk_free(SRP_gN_cache, (st)) +#define sk_SRP_gN_cache_num(st) SKM_sk_num(SRP_gN_cache, (st)) +#define sk_SRP_gN_cache_value(st, i) SKM_sk_value(SRP_gN_cache, (st), (i)) +#define sk_SRP_gN_cache_set(st, i, val) SKM_sk_set(SRP_gN_cache, (st), (i), (val)) +#define sk_SRP_gN_cache_zero(st) SKM_sk_zero(SRP_gN_cache, (st)) +#define sk_SRP_gN_cache_push(st, val) SKM_sk_push(SRP_gN_cache, (st), (val)) +#define sk_SRP_gN_cache_unshift(st, val) SKM_sk_unshift(SRP_gN_cache, (st), (val)) +#define sk_SRP_gN_cache_find(st, val) SKM_sk_find(SRP_gN_cache, (st), (val)) +#define sk_SRP_gN_cache_find_ex(st, val) SKM_sk_find_ex(SRP_gN_cache, (st), (val)) +#define sk_SRP_gN_cache_delete(st, i) SKM_sk_delete(SRP_gN_cache, (st), (i)) +#define sk_SRP_gN_cache_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_gN_cache, (st), (ptr)) +#define sk_SRP_gN_cache_insert(st, val, i) SKM_sk_insert(SRP_gN_cache, (st), (val), (i)) +#define sk_SRP_gN_cache_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_gN_cache, (st), (cmp)) +#define sk_SRP_gN_cache_dup(st) SKM_sk_dup(SRP_gN_cache, st) +#define sk_SRP_gN_cache_pop_free(st, free_func) SKM_sk_pop_free(SRP_gN_cache, (st), (free_func)) +#define sk_SRP_gN_cache_shift(st) SKM_sk_shift(SRP_gN_cache, (st)) +#define sk_SRP_gN_cache_pop(st) SKM_sk_pop(SRP_gN_cache, (st)) +#define sk_SRP_gN_cache_sort(st) SKM_sk_sort(SRP_gN_cache, (st)) +#define sk_SRP_gN_cache_is_sorted(st) SKM_sk_is_sorted(SRP_gN_cache, (st)) + +#define sk_SRP_user_pwd_new(cmp) SKM_sk_new(SRP_user_pwd, (cmp)) +#define sk_SRP_user_pwd_new_null() SKM_sk_new_null(SRP_user_pwd) +#define sk_SRP_user_pwd_free(st) SKM_sk_free(SRP_user_pwd, (st)) +#define sk_SRP_user_pwd_num(st) SKM_sk_num(SRP_user_pwd, (st)) +#define sk_SRP_user_pwd_value(st, i) SKM_sk_value(SRP_user_pwd, (st), (i)) +#define sk_SRP_user_pwd_set(st, i, val) SKM_sk_set(SRP_user_pwd, (st), (i), (val)) +#define sk_SRP_user_pwd_zero(st) SKM_sk_zero(SRP_user_pwd, (st)) +#define sk_SRP_user_pwd_push(st, val) SKM_sk_push(SRP_user_pwd, (st), (val)) +#define sk_SRP_user_pwd_unshift(st, val) SKM_sk_unshift(SRP_user_pwd, (st), (val)) +#define sk_SRP_user_pwd_find(st, val) SKM_sk_find(SRP_user_pwd, (st), (val)) +#define sk_SRP_user_pwd_find_ex(st, val) SKM_sk_find_ex(SRP_user_pwd, (st), (val)) +#define sk_SRP_user_pwd_delete(st, i) SKM_sk_delete(SRP_user_pwd, (st), (i)) +#define sk_SRP_user_pwd_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRP_user_pwd, (st), (ptr)) +#define sk_SRP_user_pwd_insert(st, val, i) SKM_sk_insert(SRP_user_pwd, (st), (val), (i)) +#define sk_SRP_user_pwd_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRP_user_pwd, (st), (cmp)) +#define sk_SRP_user_pwd_dup(st) SKM_sk_dup(SRP_user_pwd, st) +#define sk_SRP_user_pwd_pop_free(st, free_func) SKM_sk_pop_free(SRP_user_pwd, (st), (free_func)) +#define sk_SRP_user_pwd_shift(st) SKM_sk_shift(SRP_user_pwd, (st)) +#define sk_SRP_user_pwd_pop(st) SKM_sk_pop(SRP_user_pwd, (st)) +#define sk_SRP_user_pwd_sort(st) SKM_sk_sort(SRP_user_pwd, (st)) +#define sk_SRP_user_pwd_is_sorted(st) SKM_sk_is_sorted(SRP_user_pwd, (st)) + +#define sk_SRTP_PROTECTION_PROFILE_new(cmp) SKM_sk_new(SRTP_PROTECTION_PROFILE, (cmp)) +#define sk_SRTP_PROTECTION_PROFILE_new_null() SKM_sk_new_null(SRTP_PROTECTION_PROFILE) +#define sk_SRTP_PROTECTION_PROFILE_free(st) SKM_sk_free(SRTP_PROTECTION_PROFILE, (st)) +#define sk_SRTP_PROTECTION_PROFILE_num(st) SKM_sk_num(SRTP_PROTECTION_PROFILE, (st)) +#define sk_SRTP_PROTECTION_PROFILE_value(st, i) SKM_sk_value(SRTP_PROTECTION_PROFILE, (st), (i)) +#define sk_SRTP_PROTECTION_PROFILE_set(st, i, val) SKM_sk_set(SRTP_PROTECTION_PROFILE, (st), (i), (val)) +#define sk_SRTP_PROTECTION_PROFILE_zero(st) SKM_sk_zero(SRTP_PROTECTION_PROFILE, (st)) +#define sk_SRTP_PROTECTION_PROFILE_push(st, val) SKM_sk_push(SRTP_PROTECTION_PROFILE, (st), (val)) +#define sk_SRTP_PROTECTION_PROFILE_unshift(st, val) SKM_sk_unshift(SRTP_PROTECTION_PROFILE, (st), (val)) +#define sk_SRTP_PROTECTION_PROFILE_find(st, val) SKM_sk_find(SRTP_PROTECTION_PROFILE, (st), (val)) +#define sk_SRTP_PROTECTION_PROFILE_find_ex(st, val) SKM_sk_find_ex(SRTP_PROTECTION_PROFILE, (st), (val)) +#define sk_SRTP_PROTECTION_PROFILE_delete(st, i) SKM_sk_delete(SRTP_PROTECTION_PROFILE, (st), (i)) +#define sk_SRTP_PROTECTION_PROFILE_delete_ptr(st, ptr) SKM_sk_delete_ptr(SRTP_PROTECTION_PROFILE, (st), (ptr)) +#define sk_SRTP_PROTECTION_PROFILE_insert(st, val, i) SKM_sk_insert(SRTP_PROTECTION_PROFILE, (st), (val), (i)) +#define sk_SRTP_PROTECTION_PROFILE_set_cmp_func(st, cmp) SKM_sk_set_cmp_func(SRTP_PROTECTION_PROFILE, (st), (cmp)) +#define sk_SRTP_PROTECTION_PROFILE_dup(st) SKM_sk_dup(SRTP_PROTECTION_PROFILE, st) +#define sk_SRTP_PROTECTION_PROFILE_pop_free(st, free_func) SKM_sk_pop_free(SRTP_PROTECTION_PROFILE, (st), (free_func)) +#define sk_SRTP_PROTECTION_PROFILE_shift(st) SKM_sk_shift(SRTP_PROTECTION_PROFILE, (st)) +#define sk_SRTP_PROTECTION_PROFILE_pop(st) SKM_sk_pop(SRTP_PROTECTION_PROFILE, (st)) +#define sk_SRTP_PROTECTION_PROFILE_sort(st) SKM_sk_sort(SRTP_PROTECTION_PROFILE, (st)) +#define sk_SRTP_PROTECTION_PROFILE_is_sorted(st) SKM_sk_is_sorted(SRTP_PROTECTION_PROFILE, (st)) + #define sk_SSL_CIPHER_new(cmp) SKM_sk_new(SSL_CIPHER, (cmp)) #define sk_SSL_CIPHER_new_null() SKM_sk_new_null(SSL_CIPHER) #define sk_SSL_CIPHER_free(st) SKM_sk_free(SSL_CIPHER, (st)) @@ -2056,31 +2144,6 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void) #define sk_OPENSSL_STRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_STRING, (st)) -#define sk_OPENSSL_PSTRING_new(cmp) ((STACK_OF(OPENSSL_PSTRING) *)sk_new(CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp))) -#define sk_OPENSSL_PSTRING_new_null() ((STACK_OF(OPENSSL_PSTRING) *)sk_new_null()) -#define sk_OPENSSL_PSTRING_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val)) -#define sk_OPENSSL_PSTRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val)) -#define sk_OPENSSL_PSTRING_value(st, i) ((OPENSSL_PSTRING)sk_value(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i)) -#define sk_OPENSSL_PSTRING_num(st) SKM_sk_num(OPENSSL_PSTRING, st) -#define sk_OPENSSL_PSTRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_PSTRING, free_func)) -#define sk_OPENSSL_PSTRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val), i) -#define sk_OPENSSL_PSTRING_free(st) SKM_sk_free(OPENSSL_PSTRING, st) -#define sk_OPENSSL_PSTRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i, CHECKED_PTR_OF(OPENSSL_STRING, val)) -#define sk_OPENSSL_PSTRING_zero(st) SKM_sk_zero(OPENSSL_PSTRING, (st)) -#define sk_OPENSSL_PSTRING_unshift(st, val) sk_unshift(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val)) -#define sk_OPENSSL_PSTRING_find_ex(st, val) sk_find_ex((_STACK *)CHECKED_CONST_PTR_OF(STACK_OF(OPENSSL_PSTRING), st), CHECKED_CONST_PTR_OF(OPENSSL_STRING, val)) -#define sk_OPENSSL_PSTRING_delete(st, i) SKM_sk_delete(OPENSSL_PSTRING, (st), (i)) -#define sk_OPENSSL_PSTRING_delete_ptr(st, ptr) (OPENSSL_PSTRING *)sk_delete_ptr(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, ptr)) -#define sk_OPENSSL_PSTRING_set_cmp_func(st, cmp) \ - ((int (*)(const OPENSSL_STRING * const *,const OPENSSL_STRING * const *)) \ - sk_set_cmp_func(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp))) -#define sk_OPENSSL_PSTRING_dup(st) SKM_sk_dup(OPENSSL_PSTRING, st) -#define sk_OPENSSL_PSTRING_shift(st) SKM_sk_shift(OPENSSL_PSTRING, (st)) -#define sk_OPENSSL_PSTRING_pop(st) (OPENSSL_STRING *)sk_pop(CHECKED_STACK_OF(OPENSSL_PSTRING, st)) -#define sk_OPENSSL_PSTRING_sort(st) SKM_sk_sort(OPENSSL_PSTRING, (st)) -#define sk_OPENSSL_PSTRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_PSTRING, (st)) - - #define sk_OPENSSL_BLOCK_new(cmp) ((STACK_OF(OPENSSL_BLOCK) *)sk_new(CHECKED_SK_CMP_FUNC(void, cmp))) #define sk_OPENSSL_BLOCK_new_null() ((STACK_OF(OPENSSL_BLOCK) *)sk_new_null()) #define sk_OPENSSL_BLOCK_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_BLOCK, st), CHECKED_PTR_OF(void, val)) @@ -2106,6 +2169,31 @@ DECLARE_SPECIAL_STACK_OF(OPENSSL_BLOCK, void) #define sk_OPENSSL_BLOCK_is_sorted(st) SKM_sk_is_sorted(OPENSSL_BLOCK, (st)) +#define sk_OPENSSL_PSTRING_new(cmp) ((STACK_OF(OPENSSL_PSTRING) *)sk_new(CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp))) +#define sk_OPENSSL_PSTRING_new_null() ((STACK_OF(OPENSSL_PSTRING) *)sk_new_null()) +#define sk_OPENSSL_PSTRING_push(st, val) sk_push(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val)) +#define sk_OPENSSL_PSTRING_find(st, val) sk_find(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val)) +#define sk_OPENSSL_PSTRING_value(st, i) ((OPENSSL_PSTRING)sk_value(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i)) +#define sk_OPENSSL_PSTRING_num(st) SKM_sk_num(OPENSSL_PSTRING, st) +#define sk_OPENSSL_PSTRING_pop_free(st, free_func) sk_pop_free(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_FREE_FUNC2(OPENSSL_PSTRING, free_func)) +#define sk_OPENSSL_PSTRING_insert(st, val, i) sk_insert(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val), i) +#define sk_OPENSSL_PSTRING_free(st) SKM_sk_free(OPENSSL_PSTRING, st) +#define sk_OPENSSL_PSTRING_set(st, i, val) sk_set(CHECKED_STACK_OF(OPENSSL_PSTRING, st), i, CHECKED_PTR_OF(OPENSSL_STRING, val)) +#define sk_OPENSSL_PSTRING_zero(st) SKM_sk_zero(OPENSSL_PSTRING, (st)) +#define sk_OPENSSL_PSTRING_unshift(st, val) sk_unshift(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, val)) +#define sk_OPENSSL_PSTRING_find_ex(st, val) sk_find_ex((_STACK *)CHECKED_CONST_PTR_OF(STACK_OF(OPENSSL_PSTRING), st), CHECKED_CONST_PTR_OF(OPENSSL_STRING, val)) +#define sk_OPENSSL_PSTRING_delete(st, i) SKM_sk_delete(OPENSSL_PSTRING, (st), (i)) +#define sk_OPENSSL_PSTRING_delete_ptr(st, ptr) (OPENSSL_PSTRING *)sk_delete_ptr(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_PTR_OF(OPENSSL_STRING, ptr)) +#define sk_OPENSSL_PSTRING_set_cmp_func(st, cmp) \ + ((int (*)(const OPENSSL_STRING * const *,const OPENSSL_STRING * const *)) \ + sk_set_cmp_func(CHECKED_STACK_OF(OPENSSL_PSTRING, st), CHECKED_SK_CMP_FUNC(OPENSSL_STRING, cmp))) +#define sk_OPENSSL_PSTRING_dup(st) SKM_sk_dup(OPENSSL_PSTRING, st) +#define sk_OPENSSL_PSTRING_shift(st) SKM_sk_shift(OPENSSL_PSTRING, (st)) +#define sk_OPENSSL_PSTRING_pop(st) (OPENSSL_STRING *)sk_pop(CHECKED_STACK_OF(OPENSSL_PSTRING, st)) +#define sk_OPENSSL_PSTRING_sort(st) SKM_sk_sort(OPENSSL_PSTRING, (st)) +#define sk_OPENSSL_PSTRING_is_sorted(st) SKM_sk_is_sorted(OPENSSL_PSTRING, (st)) + + #define d2i_ASN1_SET_OF_ACCESS_DESCRIPTION(st, pp, length, d2i_func, free_func, ex_tag, ex_class) \ SKM_ASN1_SET_OF_d2i(ACCESS_DESCRIPTION, (st), (pp), (length), (d2i_func), (free_func), (ex_tag), (ex_class)) #define i2d_ASN1_SET_OF_ACCESS_DESCRIPTION(st, pp, i2d_func, ex_tag, ex_class, is_set) \ diff --git a/src/lib/libcrypto/ts/ts.h b/src/lib/libcrypto/ts/ts.h index 190e8a1bf2..c2448e3c3b 100644 --- a/src/lib/libcrypto/ts/ts.h +++ b/src/lib/libcrypto/ts/ts.h @@ -86,9 +86,6 @@ #include #endif -#include - - #ifdef __cplusplus extern "C" { #endif diff --git a/src/lib/libcrypto/ts/ts_rsp_verify.c b/src/lib/libcrypto/ts/ts_rsp_verify.c index e1f3b534af..afe16afbe4 100644 --- a/src/lib/libcrypto/ts/ts_rsp_verify.c +++ b/src/lib/libcrypto/ts/ts_rsp_verify.c @@ -614,12 +614,15 @@ static int TS_compute_imprint(BIO *data, TS_TST_INFO *tst_info, goto err; } - EVP_DigestInit(&md_ctx, md); + if (!EVP_DigestInit(&md_ctx, md)) + goto err; while ((length = BIO_read(data, buffer, sizeof(buffer))) > 0) { - EVP_DigestUpdate(&md_ctx, buffer, length); + if (!EVP_DigestUpdate(&md_ctx, buffer, length)) + goto err; } - EVP_DigestFinal(&md_ctx, *imprint, NULL); + if (!EVP_DigestFinal(&md_ctx, *imprint, NULL)) + goto err; return 1; err: diff --git a/src/lib/libcrypto/ui/ui.h b/src/lib/libcrypto/ui/ui.h index 2b1cfa2289..bd78aa413f 100644 --- a/src/lib/libcrypto/ui/ui.h +++ b/src/lib/libcrypto/ui/ui.h @@ -316,7 +316,7 @@ int (*UI_method_get_writer(UI_METHOD *method))(UI*,UI_STRING*); int (*UI_method_get_flusher(UI_METHOD *method))(UI*); int (*UI_method_get_reader(UI_METHOD *method))(UI*,UI_STRING*); int (*UI_method_get_closer(UI_METHOD *method))(UI*); -char* (*UI_method_get_prompt_constructor(UI_METHOD *method))(UI*, const char*, const char*); +char * (*UI_method_get_prompt_constructor(UI_METHOD *method))(UI*, const char*, const char*); /* The following functions are helpers for method writers to access relevant data from a UI_STRING. */ diff --git a/src/lib/libcrypto/ui/ui_openssl.c b/src/lib/libcrypto/ui/ui_openssl.c index 1bc25f48d5..5832a73cf5 100644 --- a/src/lib/libcrypto/ui/ui_openssl.c +++ b/src/lib/libcrypto/ui/ui_openssl.c @@ -122,7 +122,7 @@ * sigaction and fileno included. -pedantic would be more appropriate for * the intended purposes, but we can't prevent users from adding -ansi. */ -#ifndef _POSIX_C_SOURCE +#if !defined(_POSIX_C_SOURCE) && defined(OPENSSL_SYS_VMS) #define _POSIX_C_SOURCE 2 #endif #include diff --git a/src/lib/libcrypto/whrlpool/whrlpool.h b/src/lib/libcrypto/whrlpool/whrlpool.h index 03c91da115..9e01f5b076 100644 --- a/src/lib/libcrypto/whrlpool/whrlpool.h +++ b/src/lib/libcrypto/whrlpool/whrlpool.h @@ -24,6 +24,9 @@ typedef struct { } WHIRLPOOL_CTX; #ifndef OPENSSL_NO_WHIRLPOOL +#ifdef OPENSSL_FIPS +int private_WHIRLPOOL_Init(WHIRLPOOL_CTX *c); +#endif int WHIRLPOOL_Init (WHIRLPOOL_CTX *c); int WHIRLPOOL_Update (WHIRLPOOL_CTX *c,const void *inp,size_t bytes); void WHIRLPOOL_BitUpdate(WHIRLPOOL_CTX *c,const void *inp,size_t bits); diff --git a/src/lib/libcrypto/whrlpool/wp_block.c b/src/lib/libcrypto/whrlpool/wp_block.c index 221f6cc59f..824ed1827c 100644 --- a/src/lib/libcrypto/whrlpool/wp_block.c +++ b/src/lib/libcrypto/whrlpool/wp_block.c @@ -68,9 +68,9 @@ typedef unsigned long long u64; CPUs this is actually faster! */ # endif # define GO_FOR_MMX(ctx,inp,num) do { \ - extern unsigned long OPENSSL_ia32cap_P; \ + extern unsigned int OPENSSL_ia32cap_P[]; \ void whirlpool_block_mmx(void *,const void *,size_t); \ - if (!(OPENSSL_ia32cap_P & (1<<23))) break; \ + if (!(OPENSSL_ia32cap_P[0] & (1<<23))) break; \ whirlpool_block_mmx(ctx->H.c,inp,num); return; \ } while (0) # endif diff --git a/src/lib/libcrypto/whrlpool/wp_dgst.c b/src/lib/libcrypto/whrlpool/wp_dgst.c index ee5c5c1bf3..7e28bef51d 100644 --- a/src/lib/libcrypto/whrlpool/wp_dgst.c +++ b/src/lib/libcrypto/whrlpool/wp_dgst.c @@ -52,9 +52,10 @@ */ #include "wp_locl.h" +#include #include -int WHIRLPOOL_Init (WHIRLPOOL_CTX *c) +fips_md_init(WHIRLPOOL) { memset (c,0,sizeof(*c)); return(1); diff --git a/src/lib/libcrypto/x509/x509.h b/src/lib/libcrypto/x509/x509.h index e6f8a40395..092dd7450d 100644 --- a/src/lib/libcrypto/x509/x509.h +++ b/src/lib/libcrypto/x509/x509.h @@ -657,11 +657,15 @@ int NETSCAPE_SPKI_set_pubkey(NETSCAPE_SPKI *x, EVP_PKEY *pkey); int NETSCAPE_SPKI_print(BIO *out, NETSCAPE_SPKI *spki); +int X509_signature_dump(BIO *bp,const ASN1_STRING *sig, int indent); int X509_signature_print(BIO *bp,X509_ALGOR *alg, ASN1_STRING *sig); int X509_sign(X509 *x, EVP_PKEY *pkey, const EVP_MD *md); +int X509_sign_ctx(X509 *x, EVP_MD_CTX *ctx); int X509_REQ_sign(X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md); +int X509_REQ_sign_ctx(X509_REQ *x, EVP_MD_CTX *ctx); int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md); +int X509_CRL_sign_ctx(X509_CRL *x, EVP_MD_CTX *ctx); int NETSCAPE_SPKI_sign(NETSCAPE_SPKI *x, EVP_PKEY *pkey, const EVP_MD *md); int X509_pubkey_digest(const X509 *data,const EVP_MD *type, @@ -763,6 +767,7 @@ X509_ALGOR *X509_ALGOR_dup(X509_ALGOR *xn); int X509_ALGOR_set0(X509_ALGOR *alg, ASN1_OBJECT *aobj, int ptype, void *pval); void X509_ALGOR_get0(ASN1_OBJECT **paobj, int *pptype, void **ppval, X509_ALGOR *algor); +void X509_ALGOR_set_md(X509_ALGOR *alg, const EVP_MD *md); X509_NAME *X509_NAME_dup(X509_NAME *xn); X509_NAME_ENTRY *X509_NAME_ENTRY_dup(X509_NAME_ENTRY *ne); @@ -896,6 +901,9 @@ int ASN1_item_verify(const ASN1_ITEM *it, X509_ALGOR *algor1, int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2, ASN1_BIT_STRING *signature, void *data, EVP_PKEY *pkey, const EVP_MD *type); +int ASN1_item_sign_ctx(const ASN1_ITEM *it, + X509_ALGOR *algor1, X509_ALGOR *algor2, + ASN1_BIT_STRING *signature, void *asn, EVP_MD_CTX *ctx); #endif int X509_set_version(X509 *x,long version); @@ -1161,6 +1169,9 @@ X509_ALGOR *PKCS5_pbe2_set_iv(const EVP_CIPHER *cipher, int iter, unsigned char *salt, int saltlen, unsigned char *aiv, int prf_nid); +X509_ALGOR *PKCS5_pbkdf2_set(int iter, unsigned char *salt, int saltlen, + int prf_nid, int keylen); + /* PKCS#8 utilities */ DECLARE_ASN1_FUNCTIONS(PKCS8_PRIV_KEY_INFO) diff --git a/src/lib/libcrypto/x509/x509_cmp.c b/src/lib/libcrypto/x509/x509_cmp.c index 4bc9da07e0..7c2aaee2e9 100644 --- a/src/lib/libcrypto/x509/x509_cmp.c +++ b/src/lib/libcrypto/x509/x509_cmp.c @@ -87,15 +87,20 @@ unsigned long X509_issuer_and_serial_hash(X509 *a) EVP_MD_CTX_init(&ctx); f=X509_NAME_oneline(a->cert_info->issuer,NULL,0); ret=strlen(f); - EVP_DigestInit_ex(&ctx, EVP_md5(), NULL); - EVP_DigestUpdate(&ctx,(unsigned char *)f,ret); + if (!EVP_DigestInit_ex(&ctx, EVP_md5(), NULL)) + goto err; + if (!EVP_DigestUpdate(&ctx,(unsigned char *)f,ret)) + goto err; OPENSSL_free(f); - EVP_DigestUpdate(&ctx,(unsigned char *)a->cert_info->serialNumber->data, - (unsigned long)a->cert_info->serialNumber->length); - EVP_DigestFinal_ex(&ctx,&(md[0]),NULL); + if(!EVP_DigestUpdate(&ctx,(unsigned char *)a->cert_info->serialNumber->data, + (unsigned long)a->cert_info->serialNumber->length)) + goto err; + if (!EVP_DigestFinal_ex(&ctx,&(md[0]),NULL)) + goto err; ret=( ((unsigned long)md[0] )|((unsigned long)md[1]<<8L)| ((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L) )&0xffffffffL; + err: EVP_MD_CTX_cleanup(&ctx); return(ret); } @@ -219,7 +224,9 @@ unsigned long X509_NAME_hash(X509_NAME *x) /* Make sure X509_NAME structure contains valid cached encoding */ i2d_X509_NAME(x,NULL); - EVP_Digest(x->canon_enc, x->canon_enclen, md, NULL, EVP_sha1(), NULL); + if (!EVP_Digest(x->canon_enc, x->canon_enclen, md, NULL, EVP_sha1(), + NULL)) + return 0; ret=( ((unsigned long)md[0] )|((unsigned long)md[1]<<8L)| ((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L) @@ -234,12 +241,18 @@ unsigned long X509_NAME_hash(X509_NAME *x) unsigned long X509_NAME_hash_old(X509_NAME *x) { + EVP_MD_CTX md_ctx; unsigned long ret=0; unsigned char md[16]; /* Make sure X509_NAME structure contains valid cached encoding */ i2d_X509_NAME(x,NULL); - EVP_Digest(x->bytes->data, x->bytes->length, md, NULL, EVP_md5(), NULL); + EVP_MD_CTX_init(&md_ctx); + EVP_MD_CTX_set_flags(&md_ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + EVP_DigestInit_ex(&md_ctx, EVP_md5(), NULL); + EVP_DigestUpdate(&md_ctx, x->bytes->data, x->bytes->length); + EVP_DigestFinal_ex(&md_ctx,md,NULL); + EVP_MD_CTX_cleanup(&md_ctx); ret=( ((unsigned long)md[0] )|((unsigned long)md[1]<<8L)| ((unsigned long)md[2]<<16L)|((unsigned long)md[3]<<24L) diff --git a/src/lib/libcrypto/x509/x509_lu.c b/src/lib/libcrypto/x509/x509_lu.c index 3a6e04a1de..38525a8cdd 100644 --- a/src/lib/libcrypto/x509/x509_lu.c +++ b/src/lib/libcrypto/x509/x509_lu.c @@ -87,7 +87,7 @@ void X509_LOOKUP_free(X509_LOOKUP *ctx) if (ctx == NULL) return; if ( (ctx->method != NULL) && (ctx->method->free != NULL)) - ctx->method->free(ctx); + (*ctx->method->free)(ctx); OPENSSL_free(ctx); } diff --git a/src/lib/libcrypto/x509/x509_vfy.c b/src/lib/libcrypto/x509/x509_vfy.c index 701ec565e9..b0779db023 100644 --- a/src/lib/libcrypto/x509/x509_vfy.c +++ b/src/lib/libcrypto/x509/x509_vfy.c @@ -153,7 +153,6 @@ static int x509_subject_cmp(X509 **a, X509 **b) int X509_verify_cert(X509_STORE_CTX *ctx) { X509 *x,*xtmp,*chain_ss=NULL; - X509_NAME *xn; int bad_chain = 0; X509_VERIFY_PARAM *param = ctx->param; int depth,i,ok=0; @@ -205,7 +204,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx) */ /* If we are self signed, we break */ - xn=X509_get_issuer_name(x); if (ctx->check_issued(ctx, x,x)) break; /* If we were passed a cert chain, use it first */ @@ -242,7 +240,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx) i=sk_X509_num(ctx->chain); x=sk_X509_value(ctx->chain,i-1); - xn = X509_get_subject_name(x); if (ctx->check_issued(ctx, x, x)) { /* we have a self signed certificate */ @@ -291,7 +288,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx) if (depth < num) break; /* If we are self signed, we break */ - xn=X509_get_issuer_name(x); if (ctx->check_issued(ctx,x,x)) break; ok = ctx->get_issuer(&xtmp, ctx, x); @@ -310,7 +306,6 @@ int X509_verify_cert(X509_STORE_CTX *ctx) } /* we now have our chain, lets check it... */ - xn=X509_get_issuer_name(x); /* Is last certificate looked up self signed? */ if (!ctx->check_issued(ctx,x,x)) diff --git a/src/lib/libcrypto/x509/x509type.c b/src/lib/libcrypto/x509/x509type.c index 3385ad3f67..9702ec5310 100644 --- a/src/lib/libcrypto/x509/x509type.c +++ b/src/lib/libcrypto/x509/x509type.c @@ -100,20 +100,26 @@ int X509_certificate_type(X509 *x, EVP_PKEY *pkey) break; } - i=X509_get_signature_type(x); - switch (i) + i=OBJ_obj2nid(x->sig_alg->algorithm); + if (i && OBJ_find_sigid_algs(i, NULL, &i)) { - case EVP_PKEY_RSA: - ret|=EVP_PKS_RSA; - break; - case EVP_PKEY_DSA: - ret|=EVP_PKS_DSA; - break; - case EVP_PKEY_EC: - ret|=EVP_PKS_EC; - break; - default: - break; + + switch (i) + { + case NID_rsaEncryption: + case NID_rsa: + ret|=EVP_PKS_RSA; + break; + case NID_dsa: + case NID_dsa_2: + ret|=EVP_PKS_DSA; + break; + case NID_X9_62_id_ecPublicKey: + ret|=EVP_PKS_EC; + break; + default: + break; + } } if (EVP_PKEY_size(pk) <= 1024/8)/* /8 because it's 1024 bits we look diff --git a/src/lib/libcrypto/x509/x_all.c b/src/lib/libcrypto/x509/x_all.c index 8ec88c215a..b94aeeb873 100644 --- a/src/lib/libcrypto/x509/x_all.c +++ b/src/lib/libcrypto/x509/x_all.c @@ -95,12 +95,25 @@ int X509_sign(X509 *x, EVP_PKEY *pkey, const EVP_MD *md) x->sig_alg, x->signature, x->cert_info,pkey,md)); } +int X509_sign_ctx(X509 *x, EVP_MD_CTX *ctx) + { + return ASN1_item_sign_ctx(ASN1_ITEM_rptr(X509_CINF), + x->cert_info->signature, + x->sig_alg, x->signature, x->cert_info, ctx); + } + int X509_REQ_sign(X509_REQ *x, EVP_PKEY *pkey, const EVP_MD *md) { return(ASN1_item_sign(ASN1_ITEM_rptr(X509_REQ_INFO),x->sig_alg, NULL, x->signature, x->req_info,pkey,md)); } +int X509_REQ_sign_ctx(X509_REQ *x, EVP_MD_CTX *ctx) + { + return ASN1_item_sign_ctx(ASN1_ITEM_rptr(X509_REQ_INFO), + x->sig_alg, NULL, x->signature, x->req_info, ctx); + } + int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md) { x->crl->enc.modified = 1; @@ -108,6 +121,12 @@ int X509_CRL_sign(X509_CRL *x, EVP_PKEY *pkey, const EVP_MD *md) x->sig_alg, x->signature, x->crl,pkey,md)); } +int X509_CRL_sign_ctx(X509_CRL *x, EVP_MD_CTX *ctx) + { + return ASN1_item_sign_ctx(ASN1_ITEM_rptr(X509_CRL_INFO), + x->crl->sig_alg, x->sig_alg, x->signature, x->crl, ctx); + } + int NETSCAPE_SPKI_sign(NETSCAPE_SPKI *x, EVP_PKEY *pkey, const EVP_MD *md) { return(ASN1_item_sign(ASN1_ITEM_rptr(NETSCAPE_SPKAC), x->sig_algor,NULL, diff --git a/src/lib/libcrypto/x509v3/v3_skey.c b/src/lib/libcrypto/x509v3/v3_skey.c index 202c9e4896..0a984fbaa8 100644 --- a/src/lib/libcrypto/x509v3/v3_skey.c +++ b/src/lib/libcrypto/x509v3/v3_skey.c @@ -129,7 +129,8 @@ static ASN1_OCTET_STRING *s2i_skey_id(X509V3_EXT_METHOD *method, goto err; } - EVP_Digest(pk->data, pk->length, pkey_dig, &diglen, EVP_sha1(), NULL); + if (!EVP_Digest(pk->data, pk->length, pkey_dig, &diglen, EVP_sha1(), NULL)) + goto err; if(!M_ASN1_OCTET_STRING_set(oct, pkey_dig, diglen)) { X509V3err(X509V3_F_S2I_SKEY_ID,ERR_R_MALLOC_FAILURE); diff --git a/src/lib/libcrypto/x86_64cpuid.pl b/src/lib/libcrypto/x86_64cpuid.pl index c96821a3c8..7b7b93b223 100644 --- a/src/lib/libcrypto/x86_64cpuid.pl +++ b/src/lib/libcrypto/x86_64cpuid.pl @@ -7,15 +7,24 @@ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output"; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $flavour $output"; + +($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order -if ($win64) { $arg1="%rcx"; $arg2="%rdx"; } -else { $arg1="%rdi"; $arg2="%rsi"; } print<<___; .extern OPENSSL_cpuid_setup +.hidden OPENSSL_cpuid_setup .section .init call OPENSSL_cpuid_setup +.hidden OPENSSL_ia32cap_P +.comm OPENSSL_ia32cap_P,8,4 + .text .globl OPENSSL_atomic_add @@ -46,7 +55,7 @@ OPENSSL_rdtsc: .type OPENSSL_ia32_cpuid,\@abi-omnipotent .align 16 OPENSSL_ia32_cpuid: - mov %rbx,%r8 + mov %rbx,%r8 # save %rbx xor %eax,%eax cpuid @@ -78,7 +87,15 @@ OPENSSL_ia32_cpuid: # AMD specific mov \$0x80000000,%eax cpuid - cmp \$0x80000008,%eax + cmp \$0x80000001,%eax + jb .Lintel + mov %eax,%r10d + mov \$0x80000001,%eax + cpuid + or %ecx,%r9d + and \$0x00000801,%r9d # isolate AMD XOP bit, 1<<11 + + cmp \$0x80000008,%r10d jb .Lintel mov \$0x80000008,%eax @@ -89,12 +106,12 @@ OPENSSL_ia32_cpuid: mov \$1,%eax cpuid bt \$28,%edx # test hyper-threading bit - jnc .Ldone + jnc .Lgeneric shr \$16,%ebx # number of logical processors cmp %r10b,%bl - ja .Ldone + ja .Lgeneric and \$0xefffffff,%edx # ~(1<<28) - jmp .Ldone + jmp .Lgeneric .Lintel: cmp \$4,%r11d @@ -111,30 +128,47 @@ OPENSSL_ia32_cpuid: .Lnocacheinfo: mov \$1,%eax cpuid + and \$0xbfefffff,%edx # force reserved bits to 0 cmp \$0,%r9d jne .Lnotintel - or \$0x00100000,%edx # use reserved 20th bit to engage RC4_CHAR + or \$0x40000000,%edx # set reserved bit#30 on Intel CPUs and \$15,%ah cmp \$15,%ah # examine Family ID - je .Lnotintel - or \$0x40000000,%edx # use reserved bit to skip unrolled loop + jne .Lnotintel + or \$0x00100000,%edx # set reserved bit#20 to engage RC4_CHAR .Lnotintel: bt \$28,%edx # test hyper-threading bit - jnc .Ldone + jnc .Lgeneric and \$0xefffffff,%edx # ~(1<<28) cmp \$0,%r10d - je .Ldone + je .Lgeneric or \$0x10000000,%edx # 1<<28 shr \$16,%ebx cmp \$1,%bl # see if cache is shared - ja .Ldone + ja .Lgeneric and \$0xefffffff,%edx # ~(1<<28) +.Lgeneric: + and \$0x00000800,%r9d # isolate AMD XOP flag + and \$0xfffff7ff,%ecx + or %ecx,%r9d # merge AMD XOP flag + + mov %edx,%r10d # %r9d:%r10d is copy of %ecx:%edx + bt \$27,%r9d # check OSXSAVE bit + jnc .Lclear_avx + xor %ecx,%ecx # XCR0 + .byte 0x0f,0x01,0xd0 # xgetbv + and \$6,%eax # isolate XMM and YMM state support + cmp \$6,%eax + je .Ldone +.Lclear_avx: + mov \$0xefffe7ff,%eax # ~(1<<28|1<<12|1<<11) + and %eax,%r9d # clear AVX, FMA and AMD XOP bits .Ldone: - shl \$32,%rcx - mov %edx,%eax - mov %r8,%rbx - or %rcx,%rax + shl \$32,%r9 + mov %r10d,%eax + mov %r8,%rbx # restore %rbx + or %r9,%rax ret .size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid @@ -229,4 +263,21 @@ OPENSSL_wipe_cpu: .size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu ___ +print<<___; +.globl OPENSSL_ia32_rdrand +.type OPENSSL_ia32_rdrand,\@abi-omnipotent +.align 16 +OPENSSL_ia32_rdrand: + mov \$8,%ecx +.Loop_rdrand: + rdrand %rax + jc .Lbreak_rdrand + loop .Loop_rdrand +.Lbreak_rdrand: + cmp \$0,%rax + cmove %rcx,%rax + ret +.size OPENSSL_ia32_rdrand,.-OPENSSL_ia32_rdrand +___ + close STDOUT; # flush diff --git a/src/lib/libcrypto/x86cpuid.pl b/src/lib/libcrypto/x86cpuid.pl index a7464af19b..39fd8f2293 100644 --- a/src/lib/libcrypto/x86cpuid.pl +++ b/src/lib/libcrypto/x86cpuid.pl @@ -19,9 +19,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &pushf (); &pop ("eax"); &xor ("ecx","eax"); - &bt ("ecx",21); - &jnc (&label("done")); &xor ("eax","eax"); + &bt ("ecx",21); + &jnc (&label("nocpuid")); &cpuid (); &mov ("edi","eax"); # max value for standard query level @@ -51,7 +51,14 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } # AMD specific &mov ("eax",0x80000000); &cpuid (); - &cmp ("eax",0x80000008); + &cmp ("eax",0x80000001); + &jb (&label("intel")); + &mov ("esi","eax"); + &mov ("eax",0x80000001); + &cpuid (); + &or ("ebp","ecx"); + &and ("ebp",1<<11|1); # isolate XOP bit + &cmp ("esi",0x80000008); &jb (&label("intel")); &mov ("eax",0x80000008); @@ -62,13 +69,13 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &mov ("eax",1); &cpuid (); &bt ("edx",28); - &jnc (&label("done")); + &jnc (&label("generic")); &shr ("ebx",16); &and ("ebx",0xff); &cmp ("ebx","esi"); - &ja (&label("done")); + &ja (&label("generic")); &and ("edx",0xefffffff); # clear hyper-threading bit - &jmp (&label("done")); + &jmp (&label("generic")); &set_label("intel"); &cmp ("edi",4); @@ -85,27 +92,51 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &set_label("nocacheinfo"); &mov ("eax",1); &cpuid (); + &and ("edx",0xbfefffff); # force reserved bits #20, #30 to 0 &cmp ("ebp",0); - &jne (&label("notP4")); + &jne (&label("notintel")); + &or ("edx",1<<30); # set reserved bit#30 on Intel CPUs &and (&HB("eax"),15); # familiy ID &cmp (&HB("eax"),15); # P4? - &jne (&label("notP4")); - &or ("edx",1<<20); # use reserved bit to engage RC4_CHAR -&set_label("notP4"); + &jne (&label("notintel")); + &or ("edx",1<<20); # set reserved bit#20 to engage RC4_CHAR +&set_label("notintel"); &bt ("edx",28); # test hyper-threading bit - &jnc (&label("done")); + &jnc (&label("generic")); &and ("edx",0xefffffff); &cmp ("edi",0); - &je (&label("done")); + &je (&label("generic")); &or ("edx",0x10000000); &shr ("ebx",16); &cmp (&LB("ebx"),1); - &ja (&label("done")); + &ja (&label("generic")); &and ("edx",0xefffffff); # clear hyper-threading bit if not + +&set_label("generic"); + &and ("ebp",1<<11); # isolate AMD XOP flag + &and ("ecx",0xfffff7ff); # force 11th bit to 0 + &mov ("esi","edx"); + &or ("ebp","ecx"); # merge AMD XOP flag + + &bt ("ecx",27); # check OSXSAVE bit + &jnc (&label("clear_avx")); + &xor ("ecx","ecx"); + &data_byte(0x0f,0x01,0xd0); # xgetbv + &and ("eax",6); + &cmp ("eax",6); + &je (&label("done")); + &cmp ("eax",2); + &je (&label("clear_avx")); +&set_label("clear_xmm"); + &and ("ebp",0xfdfffffd); # clear AESNI and PCLMULQDQ bits + &and ("esi",0xfeffffff); # clear FXSR +&set_label("clear_avx"); + &and ("ebp",0xefffe7ff); # clear AVX, FMA and AMD XOP bits &set_label("done"); - &mov ("eax","edx"); - &mov ("edx","ecx"); + &mov ("eax","esi"); + &mov ("edx","ebp"); +&set_label("nocpuid"); &function_end("OPENSSL_ia32_cpuid"); &external_label("OPENSSL_ia32cap_P"); @@ -199,8 +230,9 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &bt (&DWP(0,"ecx"),1); &jnc (&label("no_x87")); if ($sse2) { - &bt (&DWP(0,"ecx"),26); - &jnc (&label("no_sse2")); + &and ("ecx",1<<26|1<<24); # check SSE2 and FXSR bits + &cmp ("ecx",1<<26|1<<24); + &jne (&label("no_sse2")); &pxor ("xmm0","xmm0"); &pxor ("xmm1","xmm1"); &pxor ("xmm2","xmm2"); @@ -307,6 +339,18 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } &ret (); &function_end_B("OPENSSL_cleanse"); +&function_begin_B("OPENSSL_ia32_rdrand"); + &mov ("ecx",8); +&set_label("loop"); + &rdrand ("eax"); + &jc (&label("break")); + &loop (&label("loop")); +&set_label("break"); + &cmp ("eax",0); + &cmove ("eax","ecx"); + &ret (); +&function_end_B("OPENSSL_ia32_rdrand"); + &initseg("OPENSSL_cpuid_setup"); &asm_finish(); diff --git a/src/lib/libssl/bio_ssl.c b/src/lib/libssl/bio_ssl.c index eedac8a3fc..e9552caee2 100644 --- a/src/lib/libssl/bio_ssl.c +++ b/src/lib/libssl/bio_ssl.c @@ -538,6 +538,7 @@ err: BIO *BIO_new_ssl_connect(SSL_CTX *ctx) { +#ifndef OPENSSL_NO_SOCK BIO *ret=NULL,*con=NULL,*ssl=NULL; if ((con=BIO_new(BIO_s_connect())) == NULL) @@ -549,6 +550,7 @@ BIO *BIO_new_ssl_connect(SSL_CTX *ctx) return(ret); err: if (con != NULL) BIO_free(con); +#endif return(NULL); } diff --git a/src/lib/libssl/d1_both.c b/src/lib/libssl/d1_both.c index 9f898d6997..de8bab873f 100644 --- a/src/lib/libssl/d1_both.c +++ b/src/lib/libssl/d1_both.c @@ -227,14 +227,14 @@ int dtls1_do_write(SSL *s, int type) unsigned int len, frag_off, mac_size, blocksize; /* AHA! Figure out the MTU, and stick to the right size */ - if ( ! (SSL_get_options(s) & SSL_OP_NO_QUERY_MTU)) + if (s->d1->mtu < dtls1_min_mtu() && !(SSL_get_options(s) & SSL_OP_NO_QUERY_MTU)) { s->d1->mtu = BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_QUERY_MTU, 0, NULL); /* I've seen the kernel return bogus numbers when it doesn't know * (initial write), so just make sure we have a reasonable number */ - if ( s->d1->mtu < dtls1_min_mtu()) + if (s->d1->mtu < dtls1_min_mtu()) { s->d1->mtu = 0; s->d1->mtu = dtls1_guess_mtu(s->d1->mtu); @@ -1084,7 +1084,11 @@ int dtls1_read_failed(SSL *s, int code) return code; } - if ( ! SSL_in_init(s)) /* done, no need to send a retransmit */ +#ifndef OPENSSL_NO_HEARTBEATS + if (!SSL_in_init(s) && !s->tlsext_hb_pending) /* done, no need to send a retransmit */ +#else + if (!SSL_in_init(s)) /* done, no need to send a retransmit */ +#endif { BIO_set_flags(SSL_get_rbio(s), BIO_FLAGS_READ); return code; @@ -1417,3 +1421,171 @@ dtls1_get_ccs_header(unsigned char *data, struct ccs_header_st *ccs_hdr) ccs_hdr->type = *(data++); } + +int dtls1_shutdown(SSL *s) + { + int ret; +#ifndef OPENSSL_NO_SCTP + if (BIO_dgram_is_sctp(SSL_get_wbio(s)) && + !(s->shutdown & SSL_SENT_SHUTDOWN)) + { + ret = BIO_dgram_sctp_wait_for_dry(SSL_get_wbio(s)); + if (ret < 0) return -1; + + if (ret == 0) + BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN, 1, NULL); + } +#endif + ret = ssl3_shutdown(s); +#ifndef OPENSSL_NO_SCTP + BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SAVE_SHUTDOWN, 0, NULL); +#endif + return ret; + } + +#ifndef OPENSSL_NO_HEARTBEATS +int +dtls1_process_heartbeat(SSL *s) + { + unsigned char *p = &s->s3->rrec.data[0], *pl; + unsigned short hbtype; + unsigned int payload; + unsigned int padding = 16; /* Use minimum padding */ + + /* Read type and payload length first */ + hbtype = *p++; + n2s(p, payload); + pl = p; + + if (s->msg_callback) + s->msg_callback(0, s->version, TLS1_RT_HEARTBEAT, + &s->s3->rrec.data[0], s->s3->rrec.length, + s, s->msg_callback_arg); + + if (hbtype == TLS1_HB_REQUEST) + { + unsigned char *buffer, *bp; + int r; + + /* Allocate memory for the response, size is 1 byte + * message type, plus 2 bytes payload length, plus + * payload, plus padding + */ + buffer = OPENSSL_malloc(1 + 2 + payload + padding); + bp = buffer; + + /* Enter response type, length and copy payload */ + *bp++ = TLS1_HB_RESPONSE; + s2n(payload, bp); + memcpy(bp, pl, payload); + bp += payload; + /* Random padding */ + RAND_pseudo_bytes(bp, padding); + + r = dtls1_write_bytes(s, TLS1_RT_HEARTBEAT, buffer, 3 + payload + padding); + + if (r >= 0 && s->msg_callback) + s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT, + buffer, 3 + payload + padding, + s, s->msg_callback_arg); + + OPENSSL_free(buffer); + + if (r < 0) + return r; + } + else if (hbtype == TLS1_HB_RESPONSE) + { + unsigned int seq; + + /* We only send sequence numbers (2 bytes unsigned int), + * and 16 random bytes, so we just try to read the + * sequence number */ + n2s(pl, seq); + + if (payload == 18 && seq == s->tlsext_hb_seq) + { + dtls1_stop_timer(s); + s->tlsext_hb_seq++; + s->tlsext_hb_pending = 0; + } + } + + return 0; + } + +int +dtls1_heartbeat(SSL *s) + { + unsigned char *buf, *p; + int ret; + unsigned int payload = 18; /* Sequence number + random bytes */ + unsigned int padding = 16; /* Use minimum padding */ + + /* Only send if peer supports and accepts HB requests... */ + if (!(s->tlsext_heartbeat & SSL_TLSEXT_HB_ENABLED) || + s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_SEND_REQUESTS) + { + SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT); + return -1; + } + + /* ...and there is none in flight yet... */ + if (s->tlsext_hb_pending) + { + SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PENDING); + return -1; + } + + /* ...and no handshake in progress. */ + if (SSL_in_init(s) || s->in_handshake) + { + SSLerr(SSL_F_DTLS1_HEARTBEAT,SSL_R_UNEXPECTED_MESSAGE); + return -1; + } + + /* Check if padding is too long, payload and padding + * must not exceed 2^14 - 3 = 16381 bytes in total. + */ + OPENSSL_assert(payload + padding <= 16381); + + /* Create HeartBeat message, we just use a sequence number + * as payload to distuingish different messages and add + * some random stuff. + * - Message Type, 1 byte + * - Payload Length, 2 bytes (unsigned int) + * - Payload, the sequence number (2 bytes uint) + * - Payload, random bytes (16 bytes uint) + * - Padding + */ + buf = OPENSSL_malloc(1 + 2 + payload + padding); + p = buf; + /* Message Type */ + *p++ = TLS1_HB_REQUEST; + /* Payload length (18 bytes here) */ + s2n(payload, p); + /* Sequence number */ + s2n(s->tlsext_hb_seq, p); + /* 16 random bytes */ + RAND_pseudo_bytes(p, 16); + p += 16; + /* Random padding */ + RAND_pseudo_bytes(p, padding); + + ret = dtls1_write_bytes(s, TLS1_RT_HEARTBEAT, buf, 3 + payload + padding); + if (ret >= 0) + { + if (s->msg_callback) + s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT, + buf, 3 + payload + padding, + s, s->msg_callback_arg); + + dtls1_start_timer(s); + s->tlsext_hb_pending = 1; + } + + OPENSSL_free(buf); + + return ret; + } +#endif diff --git a/src/lib/libssl/d1_clnt.c b/src/lib/libssl/d1_clnt.c index 089fa4c7f8..a6ed09c51d 100644 --- a/src/lib/libssl/d1_clnt.c +++ b/src/lib/libssl/d1_clnt.c @@ -150,7 +150,11 @@ int dtls1_connect(SSL *s) unsigned long Time=(unsigned long)time(NULL); void (*cb)(const SSL *ssl,int type,int val)=NULL; int ret= -1; - int new_state,state,skip=0;; + int new_state,state,skip=0; +#ifndef OPENSSL_NO_SCTP + unsigned char sctpauthkey[64]; + char labelbuffer[sizeof(DTLS1_SCTP_AUTH_LABEL)]; +#endif RAND_add(&Time,sizeof(Time),0); ERR_clear_error(); @@ -164,6 +168,27 @@ int dtls1_connect(SSL *s) s->in_handshake++; if (!SSL_in_init(s) || SSL_in_before(s)) SSL_clear(s); +#ifndef OPENSSL_NO_SCTP + /* Notify SCTP BIO socket to enter handshake + * mode and prevent stream identifier other + * than 0. Will be ignored if no SCTP is used. + */ + BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE, s->in_handshake, NULL); +#endif + +#ifndef OPENSSL_NO_HEARTBEATS + /* If we're awaiting a HeartbeatResponse, pretend we + * already got and don't await it anymore, because + * Heartbeats don't make sense during handshakes anyway. + */ + if (s->tlsext_hb_pending) + { + dtls1_stop_timer(s); + s->tlsext_hb_pending = 0; + s->tlsext_hb_seq++; + } +#endif + for (;;) { state=s->state; @@ -171,7 +196,7 @@ int dtls1_connect(SSL *s) switch(s->state) { case SSL_ST_RENEGOTIATE: - s->new_session=1; + s->renegotiate=1; s->state=SSL_ST_CONNECT; s->ctx->stats.sess_connect_renegotiate++; /* break */ @@ -226,6 +251,42 @@ int dtls1_connect(SSL *s) s->hit = 0; break; +#ifndef OPENSSL_NO_SCTP + case DTLS1_SCTP_ST_CR_READ_SOCK: + + if (BIO_dgram_sctp_msg_waiting(SSL_get_rbio(s))) + { + s->s3->in_read_app_data=2; + s->rwstate=SSL_READING; + BIO_clear_retry_flags(SSL_get_rbio(s)); + BIO_set_retry_read(SSL_get_rbio(s)); + ret = -1; + goto end; + } + + s->state=s->s3->tmp.next_state; + break; + + case DTLS1_SCTP_ST_CW_WRITE_SOCK: + /* read app data until dry event */ + + ret = BIO_dgram_sctp_wait_for_dry(SSL_get_wbio(s)); + if (ret < 0) goto end; + + if (ret == 0) + { + s->s3->in_read_app_data=2; + s->rwstate=SSL_READING; + BIO_clear_retry_flags(SSL_get_rbio(s)); + BIO_set_retry_read(SSL_get_rbio(s)); + ret = -1; + goto end; + } + + s->state=s->d1->next_state; + break; +#endif + case SSL3_ST_CW_CLNT_HELLO_A: case SSL3_ST_CW_CLNT_HELLO_B: @@ -248,9 +309,17 @@ int dtls1_connect(SSL *s) s->init_num=0; - /* turn on buffering for the next lot of output */ - if (s->bbio != s->wbio) - s->wbio=BIO_push(s->bbio,s->wbio); +#ifndef OPENSSL_NO_SCTP + /* Disable buffering for SCTP */ + if (!BIO_dgram_is_sctp(SSL_get_wbio(s))) + { +#endif + /* turn on buffering for the next lot of output */ + if (s->bbio != s->wbio) + s->wbio=BIO_push(s->bbio,s->wbio); +#ifndef OPENSSL_NO_SCTP + } +#endif break; @@ -260,9 +329,25 @@ int dtls1_connect(SSL *s) if (ret <= 0) goto end; else { - dtls1_stop_timer(s); if (s->hit) + { +#ifndef OPENSSL_NO_SCTP + /* Add new shared key for SCTP-Auth, + * will be ignored if no SCTP used. + */ + snprintf((char*) labelbuffer, sizeof(DTLS1_SCTP_AUTH_LABEL), + DTLS1_SCTP_AUTH_LABEL); + + SSL_export_keying_material(s, sctpauthkey, + sizeof(sctpauthkey), labelbuffer, + sizeof(labelbuffer), NULL, 0, 0); + + BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY, + sizeof(sctpauthkey), sctpauthkey); +#endif + s->state=SSL3_ST_CR_FINISHED_A; + } else s->state=DTLS1_ST_CR_HELLO_VERIFY_REQUEST_A; } @@ -354,12 +439,20 @@ int dtls1_connect(SSL *s) case SSL3_ST_CR_SRVR_DONE_B: ret=ssl3_get_server_done(s); if (ret <= 0) goto end; + dtls1_stop_timer(s); if (s->s3->tmp.cert_req) - s->state=SSL3_ST_CW_CERT_A; + s->s3->tmp.next_state=SSL3_ST_CW_CERT_A; else - s->state=SSL3_ST_CW_KEY_EXCH_A; + s->s3->tmp.next_state=SSL3_ST_CW_KEY_EXCH_A; s->init_num=0; +#ifndef OPENSSL_NO_SCTP + if (BIO_dgram_is_sctp(SSL_get_wbio(s)) && + state == SSL_ST_RENEGOTIATE) + s->state=DTLS1_SCTP_ST_CR_READ_SOCK; + else +#endif + s->state=s->s3->tmp.next_state; break; case SSL3_ST_CW_CERT_A: @@ -378,6 +471,22 @@ int dtls1_connect(SSL *s) dtls1_start_timer(s); ret=dtls1_send_client_key_exchange(s); if (ret <= 0) goto end; + +#ifndef OPENSSL_NO_SCTP + /* Add new shared key for SCTP-Auth, + * will be ignored if no SCTP used. + */ + snprintf((char*) labelbuffer, sizeof(DTLS1_SCTP_AUTH_LABEL), + DTLS1_SCTP_AUTH_LABEL); + + SSL_export_keying_material(s, sctpauthkey, + sizeof(sctpauthkey), labelbuffer, + sizeof(labelbuffer), NULL, 0, 0); + + BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY, + sizeof(sctpauthkey), sctpauthkey); +#endif + /* EAY EAY EAY need to check for DH fix cert * sent back */ /* For TLS, cert_req is set to 2, so a cert chain @@ -388,7 +497,15 @@ int dtls1_connect(SSL *s) } else { - s->state=SSL3_ST_CW_CHANGE_A; +#ifndef OPENSSL_NO_SCTP + if (BIO_dgram_is_sctp(SSL_get_wbio(s))) + { + s->d1->next_state=SSL3_ST_CW_CHANGE_A; + s->state=DTLS1_SCTP_ST_CW_WRITE_SOCK; + } + else +#endif + s->state=SSL3_ST_CW_CHANGE_A; s->s3->change_cipher_spec=0; } @@ -400,7 +517,15 @@ int dtls1_connect(SSL *s) dtls1_start_timer(s); ret=dtls1_send_client_verify(s); if (ret <= 0) goto end; - s->state=SSL3_ST_CW_CHANGE_A; +#ifndef OPENSSL_NO_SCTP + if (BIO_dgram_is_sctp(SSL_get_wbio(s))) + { + s->d1->next_state=SSL3_ST_CW_CHANGE_A; + s->state=DTLS1_SCTP_ST_CW_WRITE_SOCK; + } + else +#endif + s->state=SSL3_ST_CW_CHANGE_A; s->init_num=0; s->s3->change_cipher_spec=0; break; @@ -412,6 +537,14 @@ int dtls1_connect(SSL *s) ret=dtls1_send_change_cipher_spec(s, SSL3_ST_CW_CHANGE_A,SSL3_ST_CW_CHANGE_B); if (ret <= 0) goto end; + +#ifndef OPENSSL_NO_SCTP + /* Change to new shared key of SCTP-Auth, + * will be ignored if no SCTP used. + */ + BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY, 0, NULL); +#endif + s->state=SSL3_ST_CW_FINISHED_A; s->init_num=0; @@ -457,9 +590,23 @@ int dtls1_connect(SSL *s) if (s->hit) { s->s3->tmp.next_state=SSL_ST_OK; +#ifndef OPENSSL_NO_SCTP + if (BIO_dgram_is_sctp(SSL_get_wbio(s))) + { + s->d1->next_state = s->s3->tmp.next_state; + s->s3->tmp.next_state=DTLS1_SCTP_ST_CW_WRITE_SOCK; + } +#endif if (s->s3->flags & SSL3_FLAGS_DELAY_CLIENT_FINISHED) { s->state=SSL_ST_OK; +#ifndef OPENSSL_NO_SCTP + if (BIO_dgram_is_sctp(SSL_get_wbio(s))) + { + s->d1->next_state = SSL_ST_OK; + s->state=DTLS1_SCTP_ST_CW_WRITE_SOCK; + } +#endif s->s3->flags|=SSL3_FLAGS_POP_BUFFER; s->s3->delay_buf_pop_ret=0; } @@ -508,6 +655,16 @@ int dtls1_connect(SSL *s) s->state=SSL3_ST_CW_CHANGE_A; else s->state=SSL_ST_OK; + +#ifndef OPENSSL_NO_SCTP + if (BIO_dgram_is_sctp(SSL_get_wbio(s)) && + state == SSL_ST_RENEGOTIATE) + { + s->d1->next_state=s->state; + s->state=DTLS1_SCTP_ST_CW_WRITE_SOCK; + } +#endif + s->init_num=0; break; @@ -515,6 +672,13 @@ int dtls1_connect(SSL *s) s->rwstate=SSL_WRITING; if (BIO_flush(s->wbio) <= 0) { + /* If the write error was fatal, stop trying */ + if (!BIO_should_retry(s->wbio)) + { + s->rwstate=SSL_NOTHING; + s->state=s->s3->tmp.next_state; + } + ret= -1; goto end; } @@ -541,6 +705,7 @@ int dtls1_connect(SSL *s) /* else do it later in ssl3_write */ s->init_num=0; + s->renegotiate=0; s->new_session=0; ssl_update_cache(s,SSL_SESS_CACHE_CLIENT); @@ -587,6 +752,15 @@ int dtls1_connect(SSL *s) } end: s->in_handshake--; + +#ifndef OPENSSL_NO_SCTP + /* Notify SCTP BIO socket to leave handshake + * mode and allow stream identifier other + * than 0. Will be ignored if no SCTP is used. + */ + BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE, s->in_handshake, NULL); +#endif + if (buf != NULL) BUF_MEM_free(buf); if (cb != NULL) diff --git a/src/lib/libssl/d1_enc.c b/src/lib/libssl/d1_enc.c index becbab91c2..07a5e97ce5 100644 --- a/src/lib/libssl/d1_enc.c +++ b/src/lib/libssl/d1_enc.c @@ -260,7 +260,7 @@ int dtls1_enc(SSL *s, int send) } /* TLS 1.0 does not bound the number of padding bytes by the block size. * All of them must have value 'padding_length'. */ - if (i > (int)rec->length) + if (i + bs > (int)rec->length) { /* Incorrect padding. SSLerr() and ssl3_alert are done * by caller: we don't want to reveal whether this is diff --git a/src/lib/libssl/d1_lib.c b/src/lib/libssl/d1_lib.c index c3b77c889b..f61f718183 100644 --- a/src/lib/libssl/d1_lib.c +++ b/src/lib/libssl/d1_lib.c @@ -82,6 +82,7 @@ SSL3_ENC_METHOD DTLSv1_enc_data={ TLS_MD_CLIENT_FINISH_CONST,TLS_MD_CLIENT_FINISH_CONST_SIZE, TLS_MD_SERVER_FINISH_CONST,TLS_MD_SERVER_FINISH_CONST_SIZE, tls1_alert_code, + tls1_export_keying_material, }; long dtls1_default_timeout(void) @@ -291,6 +292,15 @@ const SSL_CIPHER *dtls1_get_cipher(unsigned int u) void dtls1_start_timer(SSL *s) { +#ifndef OPENSSL_NO_SCTP + /* Disable timer for SCTP */ + if (BIO_dgram_is_sctp(SSL_get_wbio(s))) + { + memset(&(s->d1->next_timeout), 0, sizeof(struct timeval)); + return; + } +#endif + /* If timer is not set, initialize duration with 1 second */ if (s->d1->next_timeout.tv_sec == 0 && s->d1->next_timeout.tv_usec == 0) { @@ -381,6 +391,7 @@ void dtls1_double_timeout(SSL *s) void dtls1_stop_timer(SSL *s) { /* Reset everything */ + memset(&(s->d1->timeout), 0, sizeof(struct dtls1_timeout_st)); memset(&(s->d1->next_timeout), 0, sizeof(struct timeval)); s->d1->timeout_duration = 1; BIO_ctrl(SSL_get_rbio(s), BIO_CTRL_DGRAM_SET_NEXT_TIMEOUT, 0, &(s->d1->next_timeout)); @@ -388,10 +399,28 @@ void dtls1_stop_timer(SSL *s) dtls1_clear_record_buffer(s); } -int dtls1_handle_timeout(SSL *s) +int dtls1_check_timeout_num(SSL *s) { - DTLS1_STATE *state; + s->d1->timeout.num_alerts++; + + /* Reduce MTU after 2 unsuccessful retransmissions */ + if (s->d1->timeout.num_alerts > 2) + { + s->d1->mtu = BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_GET_FALLBACK_MTU, 0, NULL); + } + if (s->d1->timeout.num_alerts > DTLS1_TMO_ALERT_COUNT) + { + /* fail the connection, enough alerts have been sent */ + SSLerr(SSL_F_DTLS1_CHECK_TIMEOUT_NUM,SSL_R_READ_TIMEOUT_EXPIRED); + return -1; + } + + return 0; + } + +int dtls1_handle_timeout(SSL *s) + { /* if no timer is expired, don't do anything */ if (!dtls1_is_timer_expired(s)) { @@ -399,20 +428,23 @@ int dtls1_handle_timeout(SSL *s) } dtls1_double_timeout(s); - state = s->d1; - state->timeout.num_alerts++; - if ( state->timeout.num_alerts > DTLS1_TMO_ALERT_COUNT) - { - /* fail the connection, enough alerts have been sent */ - SSLerr(SSL_F_DTLS1_HANDLE_TIMEOUT,SSL_R_READ_TIMEOUT_EXPIRED); + + if (dtls1_check_timeout_num(s) < 0) return -1; + + s->d1->timeout.read_timeouts++; + if (s->d1->timeout.read_timeouts > DTLS1_TMO_READ_COUNT) + { + s->d1->timeout.read_timeouts = 1; } - state->timeout.read_timeouts++; - if ( state->timeout.read_timeouts > DTLS1_TMO_READ_COUNT) +#ifndef OPENSSL_NO_HEARTBEATS + if (s->tlsext_hb_pending) { - state->timeout.read_timeouts = 1; + s->tlsext_hb_pending = 0; + return dtls1_heartbeat(s); } +#endif dtls1_start_timer(s); return dtls1_retransmit_buffered_messages(s); diff --git a/src/lib/libssl/d1_pkt.c b/src/lib/libssl/d1_pkt.c index e0c0f0cc9a..987af60835 100644 --- a/src/lib/libssl/d1_pkt.c +++ b/src/lib/libssl/d1_pkt.c @@ -179,7 +179,6 @@ static int dtls1_record_needs_buffering(SSL *s, SSL3_RECORD *rr, static int dtls1_buffer_record(SSL *s, record_pqueue *q, unsigned char *priority); static int dtls1_process_record(SSL *s); -static void dtls1_clear_timeouts(SSL *s); /* copy buffered record into SSL structure */ static int @@ -232,6 +231,14 @@ dtls1_buffer_record(SSL *s, record_pqueue *queue, unsigned char *priority) item->data = rdata; +#ifndef OPENSSL_NO_SCTP + /* Store bio_dgram_sctp_rcvinfo struct */ + if (BIO_dgram_is_sctp(SSL_get_rbio(s)) && + (s->state == SSL3_ST_SR_FINISHED_A || s->state == SSL3_ST_CR_FINISHED_A)) { + BIO_ctrl(SSL_get_rbio(s), BIO_CTRL_DGRAM_SCTP_GET_RCVINFO, sizeof(rdata->recordinfo), &rdata->recordinfo); + } +#endif + /* insert should not fail, since duplicates are dropped */ if (pqueue_insert(queue->q, item) == NULL) { @@ -376,6 +383,7 @@ dtls1_process_record(SSL *s) unsigned int mac_size; unsigned char md[EVP_MAX_MD_SIZE]; int decryption_failed_or_bad_record_mac = 0; + unsigned char *mac = NULL; rr= &(s->s3->rrec); @@ -447,19 +455,15 @@ printf("\n"); #endif } /* check the MAC for rr->input (it's in mac_size bytes at the tail) */ - if (rr->length < mac_size) + if (rr->length >= mac_size) { -#if 0 /* OK only for stream ciphers */ - al=SSL_AD_DECODE_ERROR; - SSLerr(SSL_F_DTLS1_PROCESS_RECORD,SSL_R_LENGTH_TOO_SHORT); - goto f_err; -#else - decryption_failed_or_bad_record_mac = 1; -#endif + rr->length -= mac_size; + mac = &rr->data[rr->length]; } - rr->length-=mac_size; + else + rr->length = 0; i=s->method->ssl3_enc->mac(s,md,0); - if (i < 0 || memcmp(md,&(rr->data[rr->length]),mac_size) != 0) + if (i < 0 || mac == NULL || memcmp(md, mac, mac_size) != 0) { decryption_failed_or_bad_record_mac = 1; } @@ -644,20 +648,28 @@ again: goto again; /* get another record */ } - /* Check whether this is a repeat, or aged record. - * Don't check if we're listening and this message is - * a ClientHello. They can look as if they're replayed, - * since they arrive from different connections and - * would be dropped unnecessarily. - */ - if (!(s->d1->listen && rr->type == SSL3_RT_HANDSHAKE && - *p == SSL3_MT_CLIENT_HELLO) && - !dtls1_record_replay_check(s, bitmap)) - { - rr->length = 0; - s->packet_length=0; /* dump this record */ - goto again; /* get another record */ - } +#ifndef OPENSSL_NO_SCTP + /* Only do replay check if no SCTP bio */ + if (!BIO_dgram_is_sctp(SSL_get_rbio(s))) + { +#endif + /* Check whether this is a repeat, or aged record. + * Don't check if we're listening and this message is + * a ClientHello. They can look as if they're replayed, + * since they arrive from different connections and + * would be dropped unnecessarily. + */ + if (!(s->d1->listen && rr->type == SSL3_RT_HANDSHAKE && + *p == SSL3_MT_CLIENT_HELLO) && + !dtls1_record_replay_check(s, bitmap)) + { + rr->length = 0; + s->packet_length=0; /* dump this record */ + goto again; /* get another record */ + } +#ifndef OPENSSL_NO_SCTP + } +#endif /* just read a 0 length packet */ if (rr->length == 0) goto again; @@ -685,7 +697,6 @@ again: goto again; /* get another record */ } - dtls1_clear_timeouts(s); /* done waiting */ return(1); } @@ -743,7 +754,17 @@ int dtls1_read_bytes(SSL *s, int type, unsigned char *buf, int len, int peek) /* Now s->d1->handshake_fragment_len == 0 if type == SSL3_RT_HANDSHAKE. */ +#ifndef OPENSSL_NO_SCTP + /* Continue handshake if it had to be interrupted to read + * app data with SCTP. + */ + if ((!s->in_handshake && SSL_in_init(s)) || + (BIO_dgram_is_sctp(SSL_get_rbio(s)) && + (s->state == DTLS1_SCTP_ST_SR_READ_SOCK || s->state == DTLS1_SCTP_ST_CR_READ_SOCK) && + s->s3->in_read_app_data != 2)) +#else if (!s->in_handshake && SSL_in_init(s)) +#endif { /* type == SSL3_RT_APPLICATION_DATA */ i=s->handshake_func(s); @@ -774,6 +795,15 @@ start: item = pqueue_pop(s->d1->buffered_app_data.q); if (item) { +#ifndef OPENSSL_NO_SCTP + /* Restore bio_dgram_sctp_rcvinfo struct */ + if (BIO_dgram_is_sctp(SSL_get_rbio(s))) + { + DTLS1_RECORD_DATA *rdata = (DTLS1_RECORD_DATA *) item->data; + BIO_ctrl(SSL_get_rbio(s), BIO_CTRL_DGRAM_SCTP_SET_RCVINFO, sizeof(rdata->recordinfo), &rdata->recordinfo); + } +#endif + dtls1_copy_record(s, item); OPENSSL_free(item->data); @@ -856,6 +886,31 @@ start: rr->off=0; } } + +#ifndef OPENSSL_NO_SCTP + /* We were about to renegotiate but had to read + * belated application data first, so retry. + */ + if (BIO_dgram_is_sctp(SSL_get_rbio(s)) && + rr->type == SSL3_RT_APPLICATION_DATA && + (s->state == DTLS1_SCTP_ST_SR_READ_SOCK || s->state == DTLS1_SCTP_ST_CR_READ_SOCK)) + { + s->rwstate=SSL_READING; + BIO_clear_retry_flags(SSL_get_rbio(s)); + BIO_set_retry_read(SSL_get_rbio(s)); + } + + /* We might had to delay a close_notify alert because + * of reordered app data. If there was an alert and there + * is no message to read anymore, finally set shutdown. + */ + if (BIO_dgram_is_sctp(SSL_get_rbio(s)) && + s->d1->shutdown_received && !BIO_dgram_sctp_msg_waiting(SSL_get_rbio(s))) + { + s->shutdown |= SSL_RECEIVED_SHUTDOWN; + return(0); + } +#endif return(n); } @@ -883,6 +938,19 @@ start: dest = s->d1->alert_fragment; dest_len = &s->d1->alert_fragment_len; } +#ifndef OPENSSL_NO_HEARTBEATS + else if (rr->type == TLS1_RT_HEARTBEAT) + { + dtls1_process_heartbeat(s); + + /* Exit and notify application to read again */ + rr->length = 0; + s->rwstate=SSL_READING; + BIO_clear_retry_flags(SSL_get_rbio(s)); + BIO_set_retry_read(SSL_get_rbio(s)); + return(-1); + } +#endif /* else it's a CCS message, or application data or wrong */ else if (rr->type != SSL3_RT_CHANGE_CIPHER_SPEC) { @@ -966,6 +1034,7 @@ start: !(s->s3->flags & SSL3_FLAGS_NO_RENEGOTIATE_CIPHERS) && !s->s3->renegotiate) { + s->new_session = 1; ssl3_renegotiate(s); if (ssl3_renegotiate_check(s)) { @@ -1027,6 +1096,21 @@ start: s->s3->warn_alert = alert_descr; if (alert_descr == SSL_AD_CLOSE_NOTIFY) { +#ifndef OPENSSL_NO_SCTP + /* With SCTP and streams the socket may deliver app data + * after a close_notify alert. We have to check this + * first so that nothing gets discarded. + */ + if (BIO_dgram_is_sctp(SSL_get_rbio(s)) && + BIO_dgram_sctp_msg_waiting(SSL_get_rbio(s))) + { + s->d1->shutdown_received = 1; + s->rwstate=SSL_READING; + BIO_clear_retry_flags(SSL_get_rbio(s)); + BIO_set_retry_read(SSL_get_rbio(s)); + return -1; + } +#endif s->shutdown |= SSL_RECEIVED_SHUTDOWN; return(0); } @@ -1133,6 +1217,15 @@ start: if (s->version == DTLS1_BAD_VER) s->d1->handshake_read_seq++; +#ifndef OPENSSL_NO_SCTP + /* Remember that a CCS has been received, + * so that an old key of SCTP-Auth can be + * deleted when a CCS is sent. Will be ignored + * if no SCTP is used + */ + BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_AUTH_CCS_RCVD, 1, NULL); +#endif + goto start; } @@ -1155,6 +1248,9 @@ start: */ if (msg_hdr.type == SSL3_MT_FINISHED) { + if (dtls1_check_timeout_num(s) < 0) + return -1; + dtls1_retransmit_buffered_messages(s); rr->length = 0; goto start; @@ -1172,6 +1268,7 @@ start: #else s->state = s->server ? SSL_ST_ACCEPT : SSL_ST_CONNECT; #endif + s->renegotiate=1; s->new_session=1; } i=s->handshake_func(s); @@ -1268,7 +1365,16 @@ dtls1_write_app_data_bytes(SSL *s, int type, const void *buf_, int len) { int i; - if (SSL_in_init(s) && !s->in_handshake) +#ifndef OPENSSL_NO_SCTP + /* Check if we have to continue an interrupted handshake + * for reading belated app data with SCTP. + */ + if ((SSL_in_init(s) && !s->in_handshake) || + (BIO_dgram_is_sctp(SSL_get_wbio(s)) && + (s->state == DTLS1_SCTP_ST_SR_READ_SOCK || s->state == DTLS1_SCTP_ST_CR_READ_SOCK))) +#else + if (SSL_in_init(s) && !s->in_handshake) +#endif { i=s->handshake_func(s); if (i < 0) return(i); @@ -1768,10 +1874,3 @@ dtls1_reset_seq_numbers(SSL *s, int rw) memset(seq, 0x00, seq_bytes); } - - -static void -dtls1_clear_timeouts(SSL *s) - { - memset(&(s->d1->timeout), 0x00, sizeof(struct dtls1_timeout_st)); - } diff --git a/src/lib/libssl/d1_srtp.c b/src/lib/libssl/d1_srtp.c new file mode 100644 index 0000000000..928935bd8b --- /dev/null +++ b/src/lib/libssl/d1_srtp.c @@ -0,0 +1,493 @@ +/* ssl/t1_lib.c */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ +/* ==================================================================== + * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +/* + DTLS code by Eric Rescorla + + Copyright (C) 2006, Network Resonance, Inc. + Copyright (C) 2011, RTFM, Inc. +*/ + +#ifndef OPENSSL_NO_SRTP + +#include +#include +#include "ssl_locl.h" +#include "srtp.h" + + +static SRTP_PROTECTION_PROFILE srtp_known_profiles[]= + { + { + "SRTP_AES128_CM_SHA1_80", + SRTP_AES128_CM_SHA1_80, + }, + { + "SRTP_AES128_CM_SHA1_32", + SRTP_AES128_CM_SHA1_32, + }, +#if 0 + { + "SRTP_NULL_SHA1_80", + SRTP_NULL_SHA1_80, + }, + { + "SRTP_NULL_SHA1_32", + SRTP_NULL_SHA1_32, + }, +#endif + {0} + }; + +static int find_profile_by_name(char *profile_name, + SRTP_PROTECTION_PROFILE **pptr,unsigned len) + { + SRTP_PROTECTION_PROFILE *p; + + p=srtp_known_profiles; + while(p->name) + { + if((len == strlen(p->name)) && !strncmp(p->name,profile_name, + len)) + { + *pptr=p; + return 0; + } + + p++; + } + + return 1; + } + +static int find_profile_by_num(unsigned profile_num, + SRTP_PROTECTION_PROFILE **pptr) + { + SRTP_PROTECTION_PROFILE *p; + + p=srtp_known_profiles; + while(p->name) + { + if(p->id == profile_num) + { + *pptr=p; + return 0; + } + p++; + } + + return 1; + } + +static int ssl_ctx_make_profiles(const char *profiles_string,STACK_OF(SRTP_PROTECTION_PROFILE) **out) + { + STACK_OF(SRTP_PROTECTION_PROFILE) *profiles; + + char *col; + char *ptr=(char *)profiles_string; + + SRTP_PROTECTION_PROFILE *p; + + if(!(profiles=sk_SRTP_PROTECTION_PROFILE_new_null())) + { + SSLerr(SSL_F_SSL_CTX_MAKE_PROFILES, SSL_R_SRTP_COULD_NOT_ALLOCATE_PROFILES); + return 1; + } + + do + { + col=strchr(ptr,':'); + + if(!find_profile_by_name(ptr,&p, + col ? col-ptr : (int)strlen(ptr))) + { + sk_SRTP_PROTECTION_PROFILE_push(profiles,p); + } + else + { + SSLerr(SSL_F_SSL_CTX_MAKE_PROFILES,SSL_R_SRTP_UNKNOWN_PROTECTION_PROFILE); + return 1; + } + + if(col) ptr=col+1; + } while (col); + + *out=profiles; + + return 0; + } + +int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx,const char *profiles) + { + return ssl_ctx_make_profiles(profiles,&ctx->srtp_profiles); + } + +int SSL_set_tlsext_use_srtp(SSL *s,const char *profiles) + { + return ssl_ctx_make_profiles(profiles,&s->srtp_profiles); + } + + +STACK_OF(SRTP_PROTECTION_PROFILE) *SSL_get_srtp_profiles(SSL *s) + { + if(s != NULL) + { + if(s->srtp_profiles != NULL) + { + return s->srtp_profiles; + } + else if((s->ctx != NULL) && + (s->ctx->srtp_profiles != NULL)) + { + return s->ctx->srtp_profiles; + } + } + + return NULL; + } + +SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s) + { + return s->srtp_profile; + } + +/* Note: this function returns 0 length if there are no + profiles specified */ +int ssl_add_clienthello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen) + { + int ct=0; + int i; + STACK_OF(SRTP_PROTECTION_PROFILE) *clnt=0; + SRTP_PROTECTION_PROFILE *prof; + + clnt=SSL_get_srtp_profiles(s); + ct=sk_SRTP_PROTECTION_PROFILE_num(clnt); /* -1 if clnt == 0 */ + + if(p) + { + if(ct==0) + { + SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT,SSL_R_EMPTY_SRTP_PROTECTION_PROFILE_LIST); + return 1; + } + + if((2 + ct*2 + 1) > maxlen) + { + SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT,SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG); + return 1; + } + + /* Add the length */ + s2n(ct * 2, p); + for(i=0;iid,p); + } + + /* Add an empty use_mki value */ + *p++ = 0; + } + + *len=2 + ct*2 + 1; + + return 0; + } + + +int ssl_parse_clienthello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al) + { + SRTP_PROTECTION_PROFILE *cprof,*sprof; + STACK_OF(SRTP_PROTECTION_PROFILE) *clnt=0,*srvr; + int ct; + int mki_len; + int i,j; + int id; + int ret; + + /* Length value + the MKI length */ + if(len < 3) + { + SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + /* Pull off the length of the cipher suite list */ + n2s(d, ct); + len -= 2; + + /* Check that it is even */ + if(ct%2) + { + SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + /* Check that lengths are consistent */ + if(len < (ct + 1)) + { + SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + + clnt=sk_SRTP_PROTECTION_PROFILE_new_null(); + + while(ct) + { + n2s(d,id); + ct-=2; + len-=2; + + if(!find_profile_by_num(id,&cprof)) + { + sk_SRTP_PROTECTION_PROFILE_push(clnt,cprof); + } + else + { + ; /* Ignore */ + } + } + + /* Now extract the MKI value as a sanity check, but discard it for now */ + mki_len = *d; + d++; len--; + + if (mki_len != len) + { + SSLerr(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_MKI_VALUE); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + srvr=SSL_get_srtp_profiles(s); + + /* Pick our most preferred profile. If no profiles have been + configured then the outer loop doesn't run + (sk_SRTP_PROTECTION_PROFILE_num() = -1) + and so we just return without doing anything */ + for(i=0;iid==sprof->id) + { + s->srtp_profile=sprof; + *al=0; + ret=0; + goto done; + } + } + } + + ret=0; + +done: + if(clnt) sk_SRTP_PROTECTION_PROFILE_free(clnt); + + return ret; + } + +int ssl_add_serverhello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen) + { + if(p) + { + if(maxlen < 5) + { + SSLerr(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT,SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG); + return 1; + } + + if(s->srtp_profile==0) + { + SSLerr(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT,SSL_R_USE_SRTP_NOT_NEGOTIATED); + return 1; + } + s2n(2, p); + s2n(s->srtp_profile->id,p); + *p++ = 0; + } + *len=5; + + return 0; + } + + +int ssl_parse_serverhello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al) + { + unsigned id; + int i; + int ct; + + STACK_OF(SRTP_PROTECTION_PROFILE) *clnt; + SRTP_PROTECTION_PROFILE *prof; + + if(len!=5) + { + SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + n2s(d, ct); + if(ct!=2) + { + SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + n2s(d,id); + if (*d) /* Must be no MKI, since we never offer one */ + { + SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_MKI_VALUE); + *al=SSL_AD_ILLEGAL_PARAMETER; + return 1; + } + + clnt=SSL_get_srtp_profiles(s); + + /* Throw an error if the server gave us an unsolicited extension */ + if (clnt == NULL) + { + SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_NO_SRTP_PROFILES); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + /* Check to see if the server gave us something we support + (and presumably offered) + */ + for(i=0;iid == id) + { + s->srtp_profile=prof; + *al=0; + return 0; + } + } + + SSLerr(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT,SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST); + *al=SSL_AD_DECODE_ERROR; + return 1; + } + + +#endif diff --git a/src/lib/libssl/d1_srvr.c b/src/lib/libssl/d1_srvr.c index 149983be30..29421da9aa 100644 --- a/src/lib/libssl/d1_srvr.c +++ b/src/lib/libssl/d1_srvr.c @@ -151,6 +151,10 @@ int dtls1_accept(SSL *s) int ret= -1; int new_state,state,skip=0; int listen; +#ifndef OPENSSL_NO_SCTP + unsigned char sctpauthkey[64]; + char labelbuffer[sizeof(DTLS1_SCTP_AUTH_LABEL)]; +#endif RAND_add(&Time,sizeof(Time),0); ERR_clear_error(); @@ -168,6 +172,13 @@ int dtls1_accept(SSL *s) if (!SSL_in_init(s) || SSL_in_before(s)) SSL_clear(s); s->d1->listen = listen; +#ifndef OPENSSL_NO_SCTP + /* Notify SCTP BIO socket to enter handshake + * mode and prevent stream identifier other + * than 0. Will be ignored if no SCTP is used. + */ + BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE, s->in_handshake, NULL); +#endif if (s->cert == NULL) { @@ -175,6 +186,19 @@ int dtls1_accept(SSL *s) return(-1); } +#ifndef OPENSSL_NO_HEARTBEATS + /* If we're awaiting a HeartbeatResponse, pretend we + * already got and don't await it anymore, because + * Heartbeats don't make sense during handshakes anyway. + */ + if (s->tlsext_hb_pending) + { + dtls1_stop_timer(s); + s->tlsext_hb_pending = 0; + s->tlsext_hb_seq++; + } +#endif + for (;;) { state=s->state; @@ -182,7 +206,7 @@ int dtls1_accept(SSL *s) switch (s->state) { case SSL_ST_RENEGOTIATE: - s->new_session=1; + s->renegotiate=1; /* s->state=SSL_ST_ACCEPT; */ case SSL_ST_BEFORE: @@ -227,8 +251,12 @@ int dtls1_accept(SSL *s) { /* Ok, we now need to push on a buffering BIO so that * the output is sent in a way that TCP likes :-) + * ...but not with SCTP :-) */ - if (!ssl_init_wbio_buffer(s,1)) { ret= -1; goto end; } +#ifndef OPENSSL_NO_SCTP + if (!BIO_dgram_is_sctp(SSL_get_wbio(s))) +#endif + if (!ssl_init_wbio_buffer(s,1)) { ret= -1; goto end; } ssl3_init_finished_mac(s); s->state=SSL3_ST_SR_CLNT_HELLO_A; @@ -313,25 +341,75 @@ int dtls1_accept(SSL *s) ssl3_init_finished_mac(s); break; +#ifndef OPENSSL_NO_SCTP + case DTLS1_SCTP_ST_SR_READ_SOCK: + + if (BIO_dgram_sctp_msg_waiting(SSL_get_rbio(s))) + { + s->s3->in_read_app_data=2; + s->rwstate=SSL_READING; + BIO_clear_retry_flags(SSL_get_rbio(s)); + BIO_set_retry_read(SSL_get_rbio(s)); + ret = -1; + goto end; + } + + s->state=SSL3_ST_SR_FINISHED_A; + break; + + case DTLS1_SCTP_ST_SW_WRITE_SOCK: + ret = BIO_dgram_sctp_wait_for_dry(SSL_get_wbio(s)); + if (ret < 0) goto end; + + if (ret == 0) + { + if (s->d1->next_state != SSL_ST_OK) + { + s->s3->in_read_app_data=2; + s->rwstate=SSL_READING; + BIO_clear_retry_flags(SSL_get_rbio(s)); + BIO_set_retry_read(SSL_get_rbio(s)); + ret = -1; + goto end; + } + } + + s->state=s->d1->next_state; + break; +#endif + case SSL3_ST_SW_SRVR_HELLO_A: case SSL3_ST_SW_SRVR_HELLO_B: - s->new_session = 2; + s->renegotiate = 2; dtls1_start_timer(s); ret=dtls1_send_server_hello(s); if (ret <= 0) goto end; -#ifndef OPENSSL_NO_TLSEXT if (s->hit) { +#ifndef OPENSSL_NO_SCTP + /* Add new shared key for SCTP-Auth, + * will be ignored if no SCTP used. + */ + snprintf((char*) labelbuffer, sizeof(DTLS1_SCTP_AUTH_LABEL), + DTLS1_SCTP_AUTH_LABEL); + + SSL_export_keying_material(s, sctpauthkey, + sizeof(sctpauthkey), labelbuffer, + sizeof(labelbuffer), NULL, 0, 0); + + BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY, + sizeof(sctpauthkey), sctpauthkey); +#endif +#ifndef OPENSSL_NO_TLSEXT if (s->tlsext_ticket_expected) s->state=SSL3_ST_SW_SESSION_TICKET_A; else s->state=SSL3_ST_SW_CHANGE_A; - } #else - if (s->hit) - s->state=SSL3_ST_SW_CHANGE_A; + s->state=SSL3_ST_SW_CHANGE_A; #endif + } else s->state=SSL3_ST_SW_CERT_A; s->init_num=0; @@ -441,6 +519,13 @@ int dtls1_accept(SSL *s) skip=1; s->s3->tmp.cert_request=0; s->state=SSL3_ST_SW_SRVR_DONE_A; +#ifndef OPENSSL_NO_SCTP + if (BIO_dgram_is_sctp(SSL_get_wbio(s))) + { + s->d1->next_state = SSL3_ST_SW_SRVR_DONE_A; + s->state = DTLS1_SCTP_ST_SW_WRITE_SOCK; + } +#endif } else { @@ -450,9 +535,23 @@ int dtls1_accept(SSL *s) if (ret <= 0) goto end; #ifndef NETSCAPE_HANG_BUG s->state=SSL3_ST_SW_SRVR_DONE_A; +#ifndef OPENSSL_NO_SCTP + if (BIO_dgram_is_sctp(SSL_get_wbio(s))) + { + s->d1->next_state = SSL3_ST_SW_SRVR_DONE_A; + s->state = DTLS1_SCTP_ST_SW_WRITE_SOCK; + } +#endif #else s->state=SSL3_ST_SW_FLUSH; s->s3->tmp.next_state=SSL3_ST_SR_CERT_A; +#ifndef OPENSSL_NO_SCTP + if (BIO_dgram_is_sctp(SSL_get_wbio(s))) + { + s->d1->next_state = s->s3->tmp.next_state; + s->s3->tmp.next_state=DTLS1_SCTP_ST_SW_WRITE_SOCK; + } +#endif #endif s->init_num=0; } @@ -472,6 +571,13 @@ int dtls1_accept(SSL *s) s->rwstate=SSL_WRITING; if (BIO_flush(s->wbio) <= 0) { + /* If the write error was fatal, stop trying */ + if (!BIO_should_retry(s->wbio)) + { + s->rwstate=SSL_NOTHING; + s->state=s->s3->tmp.next_state; + } + ret= -1; goto end; } @@ -485,15 +591,16 @@ int dtls1_accept(SSL *s) ret = ssl3_check_client_hello(s); if (ret <= 0) goto end; - dtls1_stop_timer(s); if (ret == 2) + { + dtls1_stop_timer(s); s->state = SSL3_ST_SR_CLNT_HELLO_C; + } else { /* could be sent for a DH cert, even if we * have not asked for it :-) */ ret=ssl3_get_client_certificate(s); if (ret <= 0) goto end; - dtls1_stop_timer(s); s->init_num=0; s->state=SSL3_ST_SR_KEY_EXCH_A; } @@ -503,7 +610,21 @@ int dtls1_accept(SSL *s) case SSL3_ST_SR_KEY_EXCH_B: ret=ssl3_get_client_key_exchange(s); if (ret <= 0) goto end; - dtls1_stop_timer(s); +#ifndef OPENSSL_NO_SCTP + /* Add new shared key for SCTP-Auth, + * will be ignored if no SCTP used. + */ + snprintf((char *) labelbuffer, sizeof(DTLS1_SCTP_AUTH_LABEL), + DTLS1_SCTP_AUTH_LABEL); + + SSL_export_keying_material(s, sctpauthkey, + sizeof(sctpauthkey), labelbuffer, + sizeof(labelbuffer), NULL, 0, 0); + + BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_ADD_AUTH_KEY, + sizeof(sctpauthkey), sctpauthkey); +#endif + s->state=SSL3_ST_SR_CERT_VRFY_A; s->init_num=0; @@ -540,9 +661,13 @@ int dtls1_accept(SSL *s) /* we should decide if we expected this one */ ret=ssl3_get_cert_verify(s); if (ret <= 0) goto end; - dtls1_stop_timer(s); - - s->state=SSL3_ST_SR_FINISHED_A; +#ifndef OPENSSL_NO_SCTP + if (BIO_dgram_is_sctp(SSL_get_wbio(s)) && + state == SSL_ST_RENEGOTIATE) + s->state=DTLS1_SCTP_ST_SR_READ_SOCK; + else +#endif + s->state=SSL3_ST_SR_FINISHED_A; s->init_num=0; break; @@ -594,6 +719,14 @@ int dtls1_accept(SSL *s) SSL3_ST_SW_CHANGE_A,SSL3_ST_SW_CHANGE_B); if (ret <= 0) goto end; + +#ifndef OPENSSL_NO_SCTP + /* Change to new shared key of SCTP-Auth, + * will be ignored if no SCTP used. + */ + BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_NEXT_AUTH_KEY, 0, NULL); +#endif + s->state=SSL3_ST_SW_FINISHED_A; s->init_num=0; @@ -618,7 +751,16 @@ int dtls1_accept(SSL *s) if (s->hit) s->s3->tmp.next_state=SSL3_ST_SR_FINISHED_A; else + { s->s3->tmp.next_state=SSL_ST_OK; +#ifndef OPENSSL_NO_SCTP + if (BIO_dgram_is_sctp(SSL_get_wbio(s))) + { + s->d1->next_state = s->s3->tmp.next_state; + s->s3->tmp.next_state=DTLS1_SCTP_ST_SW_WRITE_SOCK; + } +#endif + } s->init_num=0; break; @@ -636,11 +778,9 @@ int dtls1_accept(SSL *s) s->init_num=0; - if (s->new_session == 2) /* skipped if we just sent a HelloRequest */ + if (s->renegotiate == 2) /* skipped if we just sent a HelloRequest */ { - /* actually not necessarily a 'new' session unless - * SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION is set */ - + s->renegotiate=0; s->new_session=0; ssl_update_cache(s,SSL_SESS_CACHE_SERVER); @@ -692,6 +832,14 @@ end: /* BIO_flush(s->wbio); */ s->in_handshake--; +#ifndef OPENSSL_NO_SCTP + /* Notify SCTP BIO socket to leave handshake + * mode and prevent stream identifier other + * than 0. Will be ignored if no SCTP is used. + */ + BIO_ctrl(SSL_get_wbio(s), BIO_CTRL_DGRAM_SCTP_SET_IN_HANDSHAKE, s->in_handshake, NULL); +#endif + if (cb != NULL) cb(s,SSL_CB_ACCEPT_EXIT,ret); return(ret); @@ -772,7 +920,7 @@ int dtls1_send_server_hello(SSL *s) p=s->s3->server_random; Time=(unsigned long)time(NULL); /* Time */ l2n(Time,p); - RAND_pseudo_bytes(p,SSL3_RANDOM_SIZE-sizeof(Time)); + RAND_pseudo_bytes(p,SSL3_RANDOM_SIZE-4); /* Do the message type and length last */ d=p= &(buf[DTLS1_HM_HEADER_LENGTH]); @@ -1147,7 +1295,7 @@ int dtls1_send_server_key_exchange(SSL *s) if (!(s->s3->tmp.new_cipher->algorithm_auth & SSL_aNULL) && !(s->s3->tmp.new_cipher->algorithm_mkey & SSL_kPSK)) { - if ((pkey=ssl_get_sign_pkey(s,s->s3->tmp.new_cipher)) + if ((pkey=ssl_get_sign_pkey(s,s->s3->tmp.new_cipher, NULL)) == NULL) { al=SSL_AD_DECODE_ERROR; diff --git a/src/lib/libssl/dtls1.h b/src/lib/libssl/dtls1.h index 2900d1d8ae..5008bf6081 100644 --- a/src/lib/libssl/dtls1.h +++ b/src/lib/libssl/dtls1.h @@ -105,6 +105,11 @@ extern "C" { #define DTLS1_AL_HEADER_LENGTH 2 #endif +#ifndef OPENSSL_NO_SSL_INTERN + +#ifndef OPENSSL_NO_SCTP +#define DTLS1_SCTP_AUTH_LABEL "EXPORTER_DTLS_OVER_SCTP" +#endif typedef struct dtls1_bitmap_st { @@ -227,7 +232,7 @@ typedef struct dtls1_state_st struct dtls1_timeout_st timeout; - /* Indicates when the last handshake msg sent will timeout */ + /* Indicates when the last handshake msg or heartbeat sent will timeout */ struct timeval next_timeout; /* Timeout duration */ @@ -243,6 +248,13 @@ typedef struct dtls1_state_st unsigned int retransmitting; unsigned int change_cipher_spec_ok; +#ifndef OPENSSL_NO_SCTP + /* used when SSL_ST_XX_FLUSH is entered */ + int next_state; + + int shutdown_received; +#endif + } DTLS1_STATE; typedef struct dtls1_record_data_st @@ -251,8 +263,12 @@ typedef struct dtls1_record_data_st unsigned int packet_length; SSL3_BUFFER rbuf; SSL3_RECORD rrec; +#ifndef OPENSSL_NO_SCTP + struct bio_dgram_sctp_rcvinfo recordinfo; +#endif } DTLS1_RECORD_DATA; +#endif /* Timeout multipliers (timeout slice is defined in apps/timeouts.h */ #define DTLS1_TMO_READ_COUNT 2 diff --git a/src/lib/libssl/s23_clnt.c b/src/lib/libssl/s23_clnt.c index c4d8bf2eb3..47673e740a 100644 --- a/src/lib/libssl/s23_clnt.c +++ b/src/lib/libssl/s23_clnt.c @@ -129,6 +129,10 @@ static const SSL_METHOD *ssl23_get_client_method(int ver) return(SSLv3_client_method()); else if (ver == TLS1_VERSION) return(TLSv1_client_method()); + else if (ver == TLS1_1_VERSION) + return(TLSv1_1_client_method()); + else if (ver == TLS1_2_VERSION) + return(TLSv1_2_client_method()); else return(NULL); } @@ -278,24 +282,51 @@ static int ssl23_client_hello(SSL *s) SSL_COMP *comp; #endif int ret; + unsigned long mask, options = s->options; - ssl2_compat = (s->options & SSL_OP_NO_SSLv2) ? 0 : 1; + ssl2_compat = (options & SSL_OP_NO_SSLv2) ? 0 : 1; if (ssl2_compat && ssl23_no_ssl2_ciphers(s)) ssl2_compat = 0; - if (!(s->options & SSL_OP_NO_TLSv1)) - { + /* + * SSL_OP_NO_X disables all protocols above X *if* there are + * some protocols below X enabled. This is required in order + * to maintain "version capability" vector contiguous. So + * that if application wants to disable TLS1.0 in favour of + * TLS1>=1, it would be insufficient to pass SSL_NO_TLSv1, the + * answer is SSL_OP_NO_TLSv1|SSL_OP_NO_SSLv3|SSL_OP_NO_SSLv2. + */ + mask = SSL_OP_NO_TLSv1_1|SSL_OP_NO_TLSv1 +#if !defined(OPENSSL_NO_SSL3) + |SSL_OP_NO_SSLv3 +#endif +#if !defined(OPENSSL_NO_SSL2) + |(ssl2_compat?SSL_OP_NO_SSLv2:0) +#endif + ; +#if !defined(OPENSSL_NO_TLS1_2_CLIENT) + version = TLS1_2_VERSION; + + if ((options & SSL_OP_NO_TLSv1_2) && (options & mask) != mask) + version = TLS1_1_VERSION; +#else + version = TLS1_1_VERSION; +#endif + mask &= ~SSL_OP_NO_TLSv1_1; + if ((options & SSL_OP_NO_TLSv1_1) && (options & mask) != mask) version = TLS1_VERSION; - } - else if (!(s->options & SSL_OP_NO_SSLv3)) - { + mask &= ~SSL_OP_NO_TLSv1; +#if !defined(OPENSSL_NO_SSL3) + if ((options & SSL_OP_NO_TLSv1) && (options & mask) != mask) version = SSL3_VERSION; - } - else if (!(s->options & SSL_OP_NO_SSLv2)) - { + mask &= ~SSL_OP_NO_SSLv3; +#endif +#if !defined(OPENSSL_NO_SSL2) + if ((options & SSL_OP_NO_SSLv3) && (options & mask) != mask) version = SSL2_VERSION; - } +#endif + #ifndef OPENSSL_NO_TLSEXT if (version != SSL2_VERSION) { @@ -329,11 +360,29 @@ static int ssl23_client_hello(SSL *s) if (RAND_pseudo_bytes(p,SSL3_RANDOM_SIZE-4) <= 0) return -1; - if (version == TLS1_VERSION) + if (version == TLS1_2_VERSION) + { + version_major = TLS1_2_VERSION_MAJOR; + version_minor = TLS1_2_VERSION_MINOR; + } + else if (version == TLS1_1_VERSION) + { + version_major = TLS1_1_VERSION_MAJOR; + version_minor = TLS1_1_VERSION_MINOR; + } + else if (version == TLS1_VERSION) { version_major = TLS1_VERSION_MAJOR; version_minor = TLS1_VERSION_MINOR; } +#ifdef OPENSSL_FIPS + else if(FIPS_mode()) + { + SSLerr(SSL_F_SSL23_CLIENT_HELLO, + SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE); + return -1; + } +#endif else if (version == SSL3_VERSION) { version_major = SSL3_VERSION_MAJOR; @@ -437,6 +486,15 @@ static int ssl23_client_hello(SSL *s) SSLerr(SSL_F_SSL23_CLIENT_HELLO,SSL_R_NO_CIPHERS_AVAILABLE); return -1; } +#ifdef OPENSSL_MAX_TLS1_2_CIPHER_LENGTH + /* Some servers hang if client hello > 256 bytes + * as hack workaround chop number of supported ciphers + * to keep it well below this if we use TLS v1.2 + */ + if (TLS1_get_version(s) >= TLS1_2_VERSION + && i > OPENSSL_MAX_TLS1_2_CIPHER_LENGTH) + i = OPENSSL_MAX_TLS1_2_CIPHER_LENGTH & ~1; +#endif s2n(i,p); p+=i; @@ -491,8 +549,13 @@ static int ssl23_client_hello(SSL *s) d=buf; *(d++) = SSL3_RT_HANDSHAKE; *(d++) = version_major; - *(d++) = version_minor; /* arguably we should send the *lowest* suported version here - * (indicating, e.g., TLS 1.0 in "SSL 3.0 format") */ + /* Some servers hang if we use long client hellos + * and a record number > TLS 1.0. + */ + if (TLS1_get_client_version(s) > TLS1_VERSION) + *(d++) = 1; + else + *(d++) = version_minor; s2n((int)l,d); /* number of bytes to write */ @@ -608,7 +671,7 @@ static int ssl23_get_server_hello(SSL *s) #endif } else if (p[1] == SSL3_VERSION_MAJOR && - (p[2] == SSL3_VERSION_MINOR || p[2] == TLS1_VERSION_MINOR) && + p[2] <= TLS1_2_VERSION_MINOR && ((p[0] == SSL3_RT_HANDSHAKE && p[5] == SSL3_MT_SERVER_HELLO) || (p[0] == SSL3_RT_ALERT && p[3] == 0 && p[4] == 2))) { @@ -617,6 +680,14 @@ static int ssl23_get_server_hello(SSL *s) if ((p[2] == SSL3_VERSION_MINOR) && !(s->options & SSL_OP_NO_SSLv3)) { +#ifdef OPENSSL_FIPS + if(FIPS_mode()) + { + SSLerr(SSL_F_SSL23_GET_SERVER_HELLO, + SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE); + goto err; + } +#endif s->version=SSL3_VERSION; s->method=SSLv3_client_method(); } @@ -626,6 +697,18 @@ static int ssl23_get_server_hello(SSL *s) s->version=TLS1_VERSION; s->method=TLSv1_client_method(); } + else if ((p[2] == TLS1_1_VERSION_MINOR) && + !(s->options & SSL_OP_NO_TLSv1_1)) + { + s->version=TLS1_1_VERSION; + s->method=TLSv1_1_client_method(); + } + else if ((p[2] == TLS1_2_VERSION_MINOR) && + !(s->options & SSL_OP_NO_TLSv1_2)) + { + s->version=TLS1_2_VERSION; + s->method=TLSv1_2_client_method(); + } else { SSLerr(SSL_F_SSL23_GET_SERVER_HELLO,SSL_R_UNSUPPORTED_PROTOCOL); diff --git a/src/lib/libssl/s23_srvr.c b/src/lib/libssl/s23_srvr.c index 836dd1f1cf..4877849013 100644 --- a/src/lib/libssl/s23_srvr.c +++ b/src/lib/libssl/s23_srvr.c @@ -115,6 +115,9 @@ #include #include #include +#ifdef OPENSSL_FIPS +#include +#endif static const SSL_METHOD *ssl23_get_server_method(int ver); int ssl23_get_client_hello(SSL *s); @@ -128,6 +131,10 @@ static const SSL_METHOD *ssl23_get_server_method(int ver) return(SSLv3_server_method()); else if (ver == TLS1_VERSION) return(TLSv1_server_method()); + else if (ver == TLS1_1_VERSION) + return(TLSv1_1_server_method()); + else if (ver == TLS1_2_VERSION) + return(TLSv1_2_server_method()); else return(NULL); } @@ -283,7 +290,20 @@ int ssl23_get_client_hello(SSL *s) /* SSLv3/TLSv1 */ if (p[4] >= TLS1_VERSION_MINOR) { - if (!(s->options & SSL_OP_NO_TLSv1)) + if (p[4] >= TLS1_2_VERSION_MINOR && + !(s->options & SSL_OP_NO_TLSv1_2)) + { + s->version=TLS1_2_VERSION; + s->state=SSL23_ST_SR_CLNT_HELLO_B; + } + else if (p[4] >= TLS1_1_VERSION_MINOR && + !(s->options & SSL_OP_NO_TLSv1_1)) + { + s->version=TLS1_1_VERSION; + /* type=2; */ /* done later to survive restarts */ + s->state=SSL23_ST_SR_CLNT_HELLO_B; + } + else if (!(s->options & SSL_OP_NO_TLSv1)) { s->version=TLS1_VERSION; /* type=2; */ /* done later to survive restarts */ @@ -350,7 +370,19 @@ int ssl23_get_client_hello(SSL *s) v[1]=p[10]; /* minor version according to client_version */ if (v[1] >= TLS1_VERSION_MINOR) { - if (!(s->options & SSL_OP_NO_TLSv1)) + if (v[1] >= TLS1_2_VERSION_MINOR && + !(s->options & SSL_OP_NO_TLSv1_2)) + { + s->version=TLS1_2_VERSION; + type=3; + } + else if (v[1] >= TLS1_1_VERSION_MINOR && + !(s->options & SSL_OP_NO_TLSv1_1)) + { + s->version=TLS1_1_VERSION; + type=3; + } + else if (!(s->options & SSL_OP_NO_TLSv1)) { s->version=TLS1_VERSION; type=3; @@ -393,6 +425,15 @@ int ssl23_get_client_hello(SSL *s) } } +#ifdef OPENSSL_FIPS + if (FIPS_mode() && (s->version < TLS1_VERSION)) + { + SSLerr(SSL_F_SSL23_GET_CLIENT_HELLO, + SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE); + goto err; + } +#endif + if (s->state == SSL23_ST_SR_CLNT_HELLO_B) { /* we have SSLv3/TLSv1 in an SSLv2 header @@ -567,8 +608,11 @@ int ssl23_get_client_hello(SSL *s) s->s3->rbuf.left=0; s->s3->rbuf.offset=0; } - - if (s->version == TLS1_VERSION) + if (s->version == TLS1_2_VERSION) + s->method = TLSv1_2_server_method(); + else if (s->version == TLS1_1_VERSION) + s->method = TLSv1_1_server_method(); + else if (s->version == TLS1_VERSION) s->method = TLSv1_server_method(); else s->method = SSLv3_server_method(); diff --git a/src/lib/libssl/s3_both.c b/src/lib/libssl/s3_both.c index a6d869df59..b63460a56d 100644 --- a/src/lib/libssl/s3_both.c +++ b/src/lib/libssl/s3_both.c @@ -202,15 +202,38 @@ int ssl3_send_finished(SSL *s, int a, int b, const char *sender, int slen) return(ssl3_do_write(s,SSL3_RT_HANDSHAKE)); } +#ifndef OPENSSL_NO_NEXTPROTONEG +/* ssl3_take_mac calculates the Finished MAC for the handshakes messages seen to far. */ +static void ssl3_take_mac(SSL *s) { + const char *sender; + int slen; + + if (s->state & SSL_ST_CONNECT) + { + sender=s->method->ssl3_enc->server_finished_label; + slen=s->method->ssl3_enc->server_finished_label_len; + } + else + { + sender=s->method->ssl3_enc->client_finished_label; + slen=s->method->ssl3_enc->client_finished_label_len; + } + + s->s3->tmp.peer_finish_md_len = s->method->ssl3_enc->final_finish_mac(s, + sender,slen,s->s3->tmp.peer_finish_md); +} +#endif + int ssl3_get_finished(SSL *s, int a, int b) { int al,i,ok; long n; unsigned char *p; - /* the mac has already been generated when we received the - * change cipher spec message and is in s->s3->tmp.peer_finish_md - */ +#ifdef OPENSSL_NO_NEXTPROTONEG + /* the mac has already been generated when we received the change + * cipher spec message and is in s->s3->tmp.peer_finish_md. */ +#endif n=s->method->ssl_get_message(s, a, @@ -514,6 +537,13 @@ long ssl3_get_message(SSL *s, int st1, int stn, int mt, long max, int *ok) s->init_num += i; n -= i; } +#ifndef OPENSSL_NO_NEXTPROTONEG + /* If receiving Finished, record MAC of prior handshake messages for + * Finished verification. */ + if (*s->init_buf->data == SSL3_MT_FINISHED) + ssl3_take_mac(s); +#endif + /* Feed this message into MAC computation. */ ssl3_finish_mac(s, (unsigned char *)s->init_buf->data, s->init_num + 4); if (s->msg_callback) s->msg_callback(0, s->version, SSL3_RT_HANDSHAKE, s->init_buf->data, (size_t)s->init_num + 4, s, s->msg_callback_arg); diff --git a/src/lib/libssl/s3_clnt.c b/src/lib/libssl/s3_clnt.c index 53223bd38d..b80d052e1f 100644 --- a/src/lib/libssl/s3_clnt.c +++ b/src/lib/libssl/s3_clnt.c @@ -156,6 +156,9 @@ #include #include #include +#ifdef OPENSSL_FIPS +#include +#endif #ifndef OPENSSL_NO_DH #include #endif @@ -200,6 +203,18 @@ int ssl3_connect(SSL *s) s->in_handshake++; if (!SSL_in_init(s) || SSL_in_before(s)) SSL_clear(s); +#ifndef OPENSSL_NO_HEARTBEATS + /* If we're awaiting a HeartbeatResponse, pretend we + * already got and don't await it anymore, because + * Heartbeats don't make sense during handshakes anyway. + */ + if (s->tlsext_hb_pending) + { + s->tlsext_hb_pending = 0; + s->tlsext_hb_seq++; + } +#endif + for (;;) { state=s->state; @@ -207,7 +222,7 @@ int ssl3_connect(SSL *s) switch(s->state) { case SSL_ST_RENEGOTIATE: - s->new_session=1; + s->renegotiate=1; s->state=SSL_ST_CONNECT; s->ctx->stats.sess_connect_renegotiate++; /* break */ @@ -280,7 +295,16 @@ int ssl3_connect(SSL *s) if (ret <= 0) goto end; if (s->hit) + { s->state=SSL3_ST_CR_FINISHED_A; +#ifndef OPENSSL_NO_TLSEXT + if (s->tlsext_ticket_expected) + { + /* receive renewed session ticket */ + s->state=SSL3_ST_CR_SESSION_TICKET_A; + } +#endif + } else s->state=SSL3_ST_CR_CERT_A; s->init_num=0; @@ -358,6 +382,17 @@ int ssl3_connect(SSL *s) case SSL3_ST_CR_SRVR_DONE_B: ret=ssl3_get_server_done(s); if (ret <= 0) goto end; +#ifndef OPENSSL_NO_SRP + if (s->s3->tmp.new_cipher->algorithm_mkey & SSL_kSRP) + { + if ((ret = SRP_Calc_A_param(s))<=0) + { + SSLerr(SSL_F_SSL3_CONNECT,SSL_R_SRP_A_CALC); + ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_INTERNAL_ERROR); + goto end; + } + } +#endif if (s->s3->tmp.cert_req) s->state=SSL3_ST_CW_CERT_A; else @@ -423,7 +458,16 @@ int ssl3_connect(SSL *s) ret=ssl3_send_change_cipher_spec(s, SSL3_ST_CW_CHANGE_A,SSL3_ST_CW_CHANGE_B); if (ret <= 0) goto end; + + +#if defined(OPENSSL_NO_TLSEXT) || defined(OPENSSL_NO_NEXTPROTONEG) s->state=SSL3_ST_CW_FINISHED_A; +#else + if (s->s3->next_proto_neg_seen) + s->state=SSL3_ST_CW_NEXT_PROTO_A; + else + s->state=SSL3_ST_CW_FINISHED_A; +#endif s->init_num=0; s->session->cipher=s->s3->tmp.new_cipher; @@ -451,6 +495,15 @@ int ssl3_connect(SSL *s) break; +#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG) + case SSL3_ST_CW_NEXT_PROTO_A: + case SSL3_ST_CW_NEXT_PROTO_B: + ret=ssl3_send_next_proto(s); + if (ret <= 0) goto end; + s->state=SSL3_ST_CW_FINISHED_A; + break; +#endif + case SSL3_ST_CW_FINISHED_A: case SSL3_ST_CW_FINISHED_B: ret=ssl3_send_finished(s, @@ -546,6 +599,7 @@ int ssl3_connect(SSL *s) /* else do it later in ssl3_write */ s->init_num=0; + s->renegotiate=0; s->new_session=0; ssl_update_cache(s,SSL_SESS_CACHE_CLIENT); @@ -635,9 +689,43 @@ int ssl3_client_hello(SSL *s) /* Do the message type and length last */ d=p= &(buf[4]); + /* version indicates the negotiated version: for example from + * an SSLv2/v3 compatible client hello). The client_version + * field is the maximum version we permit and it is also + * used in RSA encrypted premaster secrets. Some servers can + * choke if we initially report a higher version then + * renegotiate to a lower one in the premaster secret. This + * didn't happen with TLS 1.0 as most servers supported it + * but it can with TLS 1.1 or later if the server only supports + * 1.0. + * + * Possible scenario with previous logic: + * 1. Client hello indicates TLS 1.2 + * 2. Server hello says TLS 1.0 + * 3. RSA encrypted premaster secret uses 1.2. + * 4. Handhaked proceeds using TLS 1.0. + * 5. Server sends hello request to renegotiate. + * 6. Client hello indicates TLS v1.0 as we now + * know that is maximum server supports. + * 7. Server chokes on RSA encrypted premaster secret + * containing version 1.0. + * + * For interoperability it should be OK to always use the + * maximum version we support in client hello and then rely + * on the checking of version to ensure the servers isn't + * being inconsistent: for example initially negotiating with + * TLS 1.0 and renegotiating with TLS 1.2. We do this by using + * client_version in client hello and not resetting it to + * the negotiated version. + */ +#if 0 *(p++)=s->version>>8; *(p++)=s->version&0xff; s->client_version=s->version; +#else + *(p++)=s->client_version>>8; + *(p++)=s->client_version&0xff; +#endif /* Random stuff */ memcpy(p,s->s3->client_random,SSL3_RANDOM_SIZE); @@ -667,6 +755,15 @@ int ssl3_client_hello(SSL *s) SSLerr(SSL_F_SSL3_CLIENT_HELLO,SSL_R_NO_CIPHERS_AVAILABLE); goto err; } +#ifdef OPENSSL_MAX_TLS1_2_CIPHER_LENGTH + /* Some servers hang if client hello > 256 bytes + * as hack workaround chop number of supported ciphers + * to keep it well below this if we use TLS v1.2 + */ + if (TLS1_get_version(s) >= TLS1_2_VERSION + && i > OPENSSL_MAX_TLS1_2_CIPHER_LENGTH) + i = OPENSSL_MAX_TLS1_2_CIPHER_LENGTH & ~1; +#endif s2n(i,p); p+=i; @@ -847,6 +944,14 @@ int ssl3_get_server_hello(SSL *s) SSLerr(SSL_F_SSL3_GET_SERVER_HELLO,SSL_R_UNKNOWN_CIPHER_RETURNED); goto f_err; } + /* TLS v1.2 only ciphersuites require v1.2 or later */ + if ((c->algorithm_ssl & SSL_TLSV1_2) && + (TLS1_get_version(s) < TLS1_2_VERSION)) + { + al=SSL_AD_ILLEGAL_PARAMETER; + SSLerr(SSL_F_SSL3_GET_SERVER_HELLO,SSL_R_WRONG_CIPHER_RETURNED); + goto f_err; + } p+=ssl_put_cipher_by_char(s,NULL,NULL); sk=ssl_get_ciphers_by_id(s); @@ -878,9 +983,11 @@ int ssl3_get_server_hello(SSL *s) } } s->s3->tmp.new_cipher=c; - if (!ssl3_digest_cached_records(s)) + /* Don't digest cached records if TLS v1.2: we may need them for + * client authentication. + */ + if (TLS1_get_version(s) < TLS1_2_VERSION && !ssl3_digest_cached_records(s)) goto f_err; - /* lets get the compression algorithm */ /* COMPRESSION */ #ifdef OPENSSL_NO_COMP @@ -1159,6 +1266,7 @@ int ssl3_get_key_exchange(SSL *s) int al,i,j,param_len,ok; long n,alg_k,alg_a; EVP_PKEY *pkey=NULL; + const EVP_MD *md = NULL; #ifndef OPENSSL_NO_RSA RSA *rsa=NULL; #endif @@ -1282,6 +1390,86 @@ int ssl3_get_key_exchange(SSL *s) } else #endif /* !OPENSSL_NO_PSK */ +#ifndef OPENSSL_NO_SRP + if (alg_k & SSL_kSRP) + { + n2s(p,i); + param_len=i+2; + if (param_len > n) + { + al=SSL_AD_DECODE_ERROR; + SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_BAD_SRP_N_LENGTH); + goto f_err; + } + if (!(s->srp_ctx.N=BN_bin2bn(p,i,NULL))) + { + SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_BN_LIB); + goto err; + } + p+=i; + + n2s(p,i); + param_len+=i+2; + if (param_len > n) + { + al=SSL_AD_DECODE_ERROR; + SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_BAD_SRP_G_LENGTH); + goto f_err; + } + if (!(s->srp_ctx.g=BN_bin2bn(p,i,NULL))) + { + SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_BN_LIB); + goto err; + } + p+=i; + + i = (unsigned int)(p[0]); + p++; + param_len+=i+1; + if (param_len > n) + { + al=SSL_AD_DECODE_ERROR; + SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_BAD_SRP_S_LENGTH); + goto f_err; + } + if (!(s->srp_ctx.s=BN_bin2bn(p,i,NULL))) + { + SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_BN_LIB); + goto err; + } + p+=i; + + n2s(p,i); + param_len+=i+2; + if (param_len > n) + { + al=SSL_AD_DECODE_ERROR; + SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_BAD_SRP_B_LENGTH); + goto f_err; + } + if (!(s->srp_ctx.B=BN_bin2bn(p,i,NULL))) + { + SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_BN_LIB); + goto err; + } + p+=i; + n-=param_len; + +/* We must check if there is a certificate */ +#ifndef OPENSSL_NO_RSA + if (alg_a & SSL_aRSA) + pkey=X509_get_pubkey(s->session->sess_cert->peer_pkeys[SSL_PKEY_RSA_ENC].x509); +#else + if (0) + ; +#endif +#ifndef OPENSSL_NO_DSA + else if (alg_a & SSL_aDSS) + pkey=X509_get_pubkey(s->session->sess_cert->peer_pkeys[SSL_PKEY_DSA_SIGN].x509); +#endif + } + else +#endif /* !OPENSSL_NO_SRP */ #ifndef OPENSSL_NO_RSA if (alg_k & SSL_kRSA) { @@ -1529,6 +1717,38 @@ int ssl3_get_key_exchange(SSL *s) /* if it was signed, check the signature */ if (pkey != NULL) { + if (TLS1_get_version(s) >= TLS1_2_VERSION) + { + int sigalg = tls12_get_sigid(pkey); + /* Should never happen */ + if (sigalg == -1) + { + SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR); + goto err; + } + /* Check key type is consistent with signature */ + if (sigalg != (int)p[1]) + { + SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_WRONG_SIGNATURE_TYPE); + al=SSL_AD_DECODE_ERROR; + goto f_err; + } + md = tls12_get_hash(p[0]); + if (md == NULL) + { + SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_UNKNOWN_DIGEST); + al=SSL_AD_DECODE_ERROR; + goto f_err; + } +#ifdef SSL_DEBUG +fprintf(stderr, "USING TLSv1.2 HASH %s\n", EVP_MD_name(md)); +#endif + p += 2; + n -= 2; + } + else + md = EVP_sha1(); + n2s(p,i); n-=2; j=EVP_PKEY_size(pkey); @@ -1542,7 +1762,7 @@ int ssl3_get_key_exchange(SSL *s) } #ifndef OPENSSL_NO_RSA - if (pkey->type == EVP_PKEY_RSA) + if (pkey->type == EVP_PKEY_RSA && TLS1_get_version(s) < TLS1_2_VERSION) { int num; @@ -1550,6 +1770,8 @@ int ssl3_get_key_exchange(SSL *s) q=md_buf; for (num=2; num > 0; num--) { + EVP_MD_CTX_set_flags(&md_ctx, + EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); EVP_DigestInit_ex(&md_ctx,(num == 2) ?s->ctx->md5:s->ctx->sha1, NULL); EVP_DigestUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE); @@ -1577,29 +1799,8 @@ int ssl3_get_key_exchange(SSL *s) } else #endif -#ifndef OPENSSL_NO_DSA - if (pkey->type == EVP_PKEY_DSA) - { - /* lets do DSS */ - EVP_VerifyInit_ex(&md_ctx,EVP_dss1(), NULL); - EVP_VerifyUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE); - EVP_VerifyUpdate(&md_ctx,&(s->s3->server_random[0]),SSL3_RANDOM_SIZE); - EVP_VerifyUpdate(&md_ctx,param,param_len); - if (EVP_VerifyFinal(&md_ctx,p,(int)n,pkey) <= 0) - { - /* bad signature */ - al=SSL_AD_DECRYPT_ERROR; - SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,SSL_R_BAD_SIGNATURE); - goto f_err; - } - } - else -#endif -#ifndef OPENSSL_NO_ECDSA - if (pkey->type == EVP_PKEY_EC) { - /* let's do ECDSA */ - EVP_VerifyInit_ex(&md_ctx,EVP_ecdsa(), NULL); + EVP_VerifyInit_ex(&md_ctx, md, NULL); EVP_VerifyUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE); EVP_VerifyUpdate(&md_ctx,&(s->s3->server_random[0]),SSL3_RANDOM_SIZE); EVP_VerifyUpdate(&md_ctx,param,param_len); @@ -1611,12 +1812,6 @@ int ssl3_get_key_exchange(SSL *s) goto f_err; } } - else -#endif - { - SSLerr(SSL_F_SSL3_GET_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR); - goto err; - } } else { @@ -1663,7 +1858,7 @@ int ssl3_get_certificate_request(SSL *s) { int ok,ret=0; unsigned long n,nc,l; - unsigned int llen,ctype_num,i; + unsigned int llen, ctype_num,i; X509_NAME *xn=NULL; const unsigned char *p,*q; unsigned char *d; @@ -1683,6 +1878,14 @@ int ssl3_get_certificate_request(SSL *s) if (s->s3->tmp.message_type == SSL3_MT_SERVER_DONE) { s->s3->tmp.reuse_message=1; + /* If we get here we don't need any cached handshake records + * as we wont be doing client auth. + */ + if (s->s3->handshake_buffer) + { + if (!ssl3_digest_cached_records(s)) + goto err; + } return(1); } @@ -1719,6 +1922,26 @@ int ssl3_get_certificate_request(SSL *s) for (i=0; is3->tmp.ctype[i]= p[i]; p+=ctype_num; + if (TLS1_get_version(s) >= TLS1_2_VERSION) + { + n2s(p, llen); + /* Check we have enough room for signature algorithms and + * following length value. + */ + if ((unsigned long)(p - d + llen + 2) > n) + { + ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_DECODE_ERROR); + SSLerr(SSL_F_SSL3_GET_CERTIFICATE_REQUEST,SSL_R_DATA_LENGTH_TOO_LONG); + goto err; + } + if ((llen & 1) || !tls1_process_sigalgs(s, p, llen)) + { + ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_DECODE_ERROR); + SSLerr(SSL_F_SSL3_GET_CERTIFICATE_REQUEST,SSL_R_SIGNATURE_ALGORITHMS_ERROR); + goto err; + } + p += llen; + } /* get the CA RDNs */ n2s(p,llen); @@ -1731,7 +1954,7 @@ fclose(out); } #endif - if ((llen+ctype_num+2+1) != n) + if ((unsigned long)(p - d + llen) != n) { ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_DECODE_ERROR); SSLerr(SSL_F_SSL3_GET_CERTIFICATE_REQUEST,SSL_R_LENGTH_MISMATCH); @@ -2553,6 +2776,39 @@ int ssl3_send_client_key_exchange(SSL *s) EVP_PKEY_free(pub_key); } +#ifndef OPENSSL_NO_SRP + else if (alg_k & SSL_kSRP) + { + if (s->srp_ctx.A != NULL) + { + /* send off the data */ + n=BN_num_bytes(s->srp_ctx.A); + s2n(n,p); + BN_bn2bin(s->srp_ctx.A,p); + n+=2; + } + else + { + SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR); + goto err; + } + if (s->session->srp_username != NULL) + OPENSSL_free(s->session->srp_username); + s->session->srp_username = BUF_strdup(s->srp_ctx.login); + if (s->session->srp_username == NULL) + { + SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE, + ERR_R_MALLOC_FAILURE); + goto err; + } + + if ((s->session->master_key_length = SRP_generate_client_master_secret(s,s->session->master_key))<0) + { + SSLerr(SSL_F_SSL3_SEND_CLIENT_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR); + goto err; + } + } +#endif #ifndef OPENSSL_NO_PSK else if (alg_k & SSL_kPSK) { @@ -2672,12 +2928,13 @@ int ssl3_send_client_verify(SSL *s) unsigned char data[MD5_DIGEST_LENGTH+SHA_DIGEST_LENGTH]; EVP_PKEY *pkey; EVP_PKEY_CTX *pctx=NULL; -#ifndef OPENSSL_NO_RSA + EVP_MD_CTX mctx; unsigned u=0; -#endif unsigned long n; int j; + EVP_MD_CTX_init(&mctx); + if (s->state == SSL3_ST_CW_CERT_VRFY_A) { d=(unsigned char *)s->init_buf->data; @@ -2688,7 +2945,8 @@ int ssl3_send_client_verify(SSL *s) EVP_PKEY_sign_init(pctx); if (EVP_PKEY_CTX_set_signature_md(pctx, EVP_sha1())>0) { - s->method->ssl3_enc->cert_verify_mac(s, + if (TLS1_get_version(s) < TLS1_2_VERSION) + s->method->ssl3_enc->cert_verify_mac(s, NID_sha1, &(data[MD5_DIGEST_LENGTH])); } @@ -2696,6 +2954,41 @@ int ssl3_send_client_verify(SSL *s) { ERR_clear_error(); } + /* For TLS v1.2 send signature algorithm and signature + * using agreed digest and cached handshake records. + */ + if (TLS1_get_version(s) >= TLS1_2_VERSION) + { + long hdatalen = 0; + void *hdata; + const EVP_MD *md = s->cert->key->digest; + hdatalen = BIO_get_mem_data(s->s3->handshake_buffer, + &hdata); + if (hdatalen <= 0 || !tls12_get_sigandhash(p, pkey, md)) + { + SSLerr(SSL_F_SSL3_SEND_CLIENT_VERIFY, + ERR_R_INTERNAL_ERROR); + goto err; + } + p += 2; +#ifdef SSL_DEBUG + fprintf(stderr, "Using TLS 1.2 with client alg %s\n", + EVP_MD_name(md)); +#endif + if (!EVP_SignInit_ex(&mctx, md, NULL) + || !EVP_SignUpdate(&mctx, hdata, hdatalen) + || !EVP_SignFinal(&mctx, p + 2, &u, pkey)) + { + SSLerr(SSL_F_SSL3_SEND_CLIENT_VERIFY, + ERR_R_EVP_LIB); + goto err; + } + s2n(u,p); + n = u + 4; + if (!ssl3_digest_cached_records(s)) + goto err; + } + else #ifndef OPENSSL_NO_RSA if (pkey->type == EVP_PKEY_RSA) { @@ -2778,9 +3071,11 @@ int ssl3_send_client_verify(SSL *s) s->init_num=(int)n+4; s->init_off=0; } + EVP_MD_CTX_cleanup(&mctx); EVP_PKEY_CTX_free(pctx); return(ssl3_do_write(s,SSL3_RT_HANDSHAKE)); err: + EVP_MD_CTX_cleanup(&mctx); EVP_PKEY_CTX_free(pctx); return(-1); } @@ -2904,7 +3199,7 @@ int ssl3_check_cert_and_algorithm(SSL *s) if (idx == SSL_PKEY_ECC) { if (ssl_check_srvr_ecc_cert_and_alg(sc->peer_pkeys[idx].x509, - s->s3->tmp.new_cipher) == 0) + s) == 0) { /* check failed */ SSLerr(SSL_F_SSL3_CHECK_CERT_AND_ALGORITHM,SSL_R_BAD_ECC_CERT); goto f_err; @@ -3000,6 +3295,32 @@ err: return(0); } +#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG) +int ssl3_send_next_proto(SSL *s) + { + unsigned int len, padding_len; + unsigned char *d; + + if (s->state == SSL3_ST_CW_NEXT_PROTO_A) + { + len = s->next_proto_negotiated_len; + padding_len = 32 - ((len + 2) % 32); + d = (unsigned char *)s->init_buf->data; + d[4] = len; + memcpy(d + 5, s->next_proto_negotiated, len); + d[5 + len] = padding_len; + memset(d + 6 + len, 0, padding_len); + *(d++)=SSL3_MT_NEXT_PROTO; + l2n3(2 + len + padding_len, d); + s->state = SSL3_ST_CW_NEXT_PROTO_B; + s->init_num = 4 + 2 + len + padding_len; + s->init_off = 0; + } + + return ssl3_do_write(s, SSL3_RT_HANDSHAKE); +} +#endif /* !OPENSSL_NO_TLSEXT && !OPENSSL_NO_NEXTPROTONEG */ + /* Check to see if handshake is full or resumed. Usually this is just a * case of checking to see if a cache hit has occurred. In the case of * session tickets we have to check the next message to be sure. diff --git a/src/lib/libssl/s3_lib.c b/src/lib/libssl/s3_lib.c index 1130244aeb..fb60cde8ee 100644 --- a/src/lib/libssl/s3_lib.c +++ b/src/lib/libssl/s3_lib.c @@ -1071,6 +1071,103 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ 256, }, + /* TLS v1.2 ciphersuites */ + /* Cipher 3B */ + { + 1, + TLS1_TXT_RSA_WITH_NULL_SHA256, + TLS1_CK_RSA_WITH_NULL_SHA256, + SSL_kRSA, + SSL_aRSA, + SSL_eNULL, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 0, + 0, + }, + + /* Cipher 3C */ + { + 1, + TLS1_TXT_RSA_WITH_AES_128_SHA256, + TLS1_CK_RSA_WITH_AES_128_SHA256, + SSL_kRSA, + SSL_aRSA, + SSL_AES128, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 128, + 128, + }, + + /* Cipher 3D */ + { + 1, + TLS1_TXT_RSA_WITH_AES_256_SHA256, + TLS1_CK_RSA_WITH_AES_256_SHA256, + SSL_kRSA, + SSL_aRSA, + SSL_AES256, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 256, + 256, + }, + + /* Cipher 3E */ + { + 0, /* not implemented (non-ephemeral DH) */ + TLS1_TXT_DH_DSS_WITH_AES_128_SHA256, + TLS1_CK_DH_DSS_WITH_AES_128_SHA256, + SSL_kDHr, + SSL_aDH, + SSL_AES128, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 128, + 128, + }, + + /* Cipher 3F */ + { + 0, /* not implemented (non-ephemeral DH) */ + TLS1_TXT_DH_RSA_WITH_AES_128_SHA256, + TLS1_CK_DH_RSA_WITH_AES_128_SHA256, + SSL_kDHr, + SSL_aDH, + SSL_AES128, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 128, + 128, + }, + + /* Cipher 40 */ + { + 1, + TLS1_TXT_DHE_DSS_WITH_AES_128_SHA256, + TLS1_CK_DHE_DSS_WITH_AES_128_SHA256, + SSL_kEDH, + SSL_aDSS, + SSL_AES128, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 128, + 128, + }, + #ifndef OPENSSL_NO_CAMELLIA /* Camellia ciphersuites from RFC4132 (128-bit portion) */ @@ -1287,6 +1384,122 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ 128, }, #endif + + /* TLS v1.2 ciphersuites */ + /* Cipher 67 */ + { + 1, + TLS1_TXT_DHE_RSA_WITH_AES_128_SHA256, + TLS1_CK_DHE_RSA_WITH_AES_128_SHA256, + SSL_kEDH, + SSL_aRSA, + SSL_AES128, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 128, + 128, + }, + + /* Cipher 68 */ + { + 0, /* not implemented (non-ephemeral DH) */ + TLS1_TXT_DH_DSS_WITH_AES_256_SHA256, + TLS1_CK_DH_DSS_WITH_AES_256_SHA256, + SSL_kDHr, + SSL_aDH, + SSL_AES256, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 256, + 256, + }, + + /* Cipher 69 */ + { + 0, /* not implemented (non-ephemeral DH) */ + TLS1_TXT_DH_RSA_WITH_AES_256_SHA256, + TLS1_CK_DH_RSA_WITH_AES_256_SHA256, + SSL_kDHr, + SSL_aDH, + SSL_AES256, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 256, + 256, + }, + + /* Cipher 6A */ + { + 1, + TLS1_TXT_DHE_DSS_WITH_AES_256_SHA256, + TLS1_CK_DHE_DSS_WITH_AES_256_SHA256, + SSL_kEDH, + SSL_aDSS, + SSL_AES256, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 256, + 256, + }, + + /* Cipher 6B */ + { + 1, + TLS1_TXT_DHE_RSA_WITH_AES_256_SHA256, + TLS1_CK_DHE_RSA_WITH_AES_256_SHA256, + SSL_kEDH, + SSL_aRSA, + SSL_AES256, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 256, + 256, + }, + + /* Cipher 6C */ + { + 1, + TLS1_TXT_ADH_WITH_AES_128_SHA256, + TLS1_CK_ADH_WITH_AES_128_SHA256, + SSL_kEDH, + SSL_aNULL, + SSL_AES128, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 128, + 128, + }, + + /* Cipher 6D */ + { + 1, + TLS1_TXT_ADH_WITH_AES_256_SHA256, + TLS1_CK_ADH_WITH_AES_256_SHA256, + SSL_kEDH, + SSL_aNULL, + SSL_AES256, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 256, + 256, + }, + + /* GOST Ciphersuites */ + { 1, "GOST94-GOST89-GOST89", @@ -1610,6 +1823,200 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ #endif /* OPENSSL_NO_SEED */ + /* GCM ciphersuites from RFC5288 */ + + /* Cipher 9C */ + { + 1, + TLS1_TXT_RSA_WITH_AES_128_GCM_SHA256, + TLS1_CK_RSA_WITH_AES_128_GCM_SHA256, + SSL_kRSA, + SSL_aRSA, + SSL_AES128GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256, + 128, + 128, + }, + + /* Cipher 9D */ + { + 1, + TLS1_TXT_RSA_WITH_AES_256_GCM_SHA384, + TLS1_CK_RSA_WITH_AES_256_GCM_SHA384, + SSL_kRSA, + SSL_aRSA, + SSL_AES256GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384, + 256, + 256, + }, + + /* Cipher 9E */ + { + 1, + TLS1_TXT_DHE_RSA_WITH_AES_128_GCM_SHA256, + TLS1_CK_DHE_RSA_WITH_AES_128_GCM_SHA256, + SSL_kEDH, + SSL_aRSA, + SSL_AES128GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256, + 128, + 128, + }, + + /* Cipher 9F */ + { + 1, + TLS1_TXT_DHE_RSA_WITH_AES_256_GCM_SHA384, + TLS1_CK_DHE_RSA_WITH_AES_256_GCM_SHA384, + SSL_kEDH, + SSL_aRSA, + SSL_AES256GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384, + 256, + 256, + }, + + /* Cipher A0 */ + { + 0, + TLS1_TXT_DH_RSA_WITH_AES_128_GCM_SHA256, + TLS1_CK_DH_RSA_WITH_AES_128_GCM_SHA256, + SSL_kDHr, + SSL_aDH, + SSL_AES128GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256, + 128, + 128, + }, + + /* Cipher A1 */ + { + 0, + TLS1_TXT_DH_RSA_WITH_AES_256_GCM_SHA384, + TLS1_CK_DH_RSA_WITH_AES_256_GCM_SHA384, + SSL_kDHr, + SSL_aDH, + SSL_AES256GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384, + 256, + 256, + }, + + /* Cipher A2 */ + { + 1, + TLS1_TXT_DHE_DSS_WITH_AES_128_GCM_SHA256, + TLS1_CK_DHE_DSS_WITH_AES_128_GCM_SHA256, + SSL_kEDH, + SSL_aDSS, + SSL_AES128GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256, + 128, + 128, + }, + + /* Cipher A3 */ + { + 1, + TLS1_TXT_DHE_DSS_WITH_AES_256_GCM_SHA384, + TLS1_CK_DHE_DSS_WITH_AES_256_GCM_SHA384, + SSL_kEDH, + SSL_aDSS, + SSL_AES256GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384, + 256, + 256, + }, + + /* Cipher A4 */ + { + 0, + TLS1_TXT_DH_DSS_WITH_AES_128_GCM_SHA256, + TLS1_CK_DH_DSS_WITH_AES_128_GCM_SHA256, + SSL_kDHr, + SSL_aDH, + SSL_AES128GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256, + 128, + 128, + }, + + /* Cipher A5 */ + { + 0, + TLS1_TXT_DH_DSS_WITH_AES_256_GCM_SHA384, + TLS1_CK_DH_DSS_WITH_AES_256_GCM_SHA384, + SSL_kDHr, + SSL_aDH, + SSL_AES256GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384, + 256, + 256, + }, + + /* Cipher A6 */ + { + 1, + TLS1_TXT_ADH_WITH_AES_128_GCM_SHA256, + TLS1_CK_ADH_WITH_AES_128_GCM_SHA256, + SSL_kEDH, + SSL_aNULL, + SSL_AES128GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256, + 128, + 128, + }, + + /* Cipher A7 */ + { + 1, + TLS1_TXT_ADH_WITH_AES_256_GCM_SHA384, + TLS1_CK_ADH_WITH_AES_256_GCM_SHA384, + SSL_kEDH, + SSL_aNULL, + SSL_AES256GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384, + 256, + 256, + }, + #ifndef OPENSSL_NO_ECDH /* Cipher C001 */ { @@ -1621,7 +2028,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ SSL_eNULL, SSL_SHA1, SSL_TLSV1, - SSL_NOT_EXP|SSL_STRONG_NONE, + SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS, SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, 0, 0, @@ -1653,7 +2060,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ SSL_3DES, SSL_SHA1, SSL_TLSV1, - SSL_NOT_EXP|SSL_HIGH, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, 168, 168, @@ -1669,7 +2076,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ SSL_AES128, SSL_SHA1, SSL_TLSV1, - SSL_NOT_EXP|SSL_HIGH, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, 128, 128, @@ -1685,7 +2092,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ SSL_AES256, SSL_SHA1, SSL_TLSV1, - SSL_NOT_EXP|SSL_HIGH, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, 256, 256, @@ -1701,7 +2108,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ SSL_eNULL, SSL_SHA1, SSL_TLSV1, - SSL_NOT_EXP|SSL_STRONG_NONE, + SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS, SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, 0, 0, @@ -1733,7 +2140,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ SSL_3DES, SSL_SHA1, SSL_TLSV1, - SSL_NOT_EXP|SSL_HIGH, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, 168, 168, @@ -1749,7 +2156,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ SSL_AES128, SSL_SHA1, SSL_TLSV1, - SSL_NOT_EXP|SSL_HIGH, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, 128, 128, @@ -1765,7 +2172,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ SSL_AES256, SSL_SHA1, SSL_TLSV1, - SSL_NOT_EXP|SSL_HIGH, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, 256, 256, @@ -1781,7 +2188,7 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ SSL_eNULL, SSL_SHA1, SSL_TLSV1, - SSL_NOT_EXP|SSL_STRONG_NONE, + SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS, SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, 0, 0, @@ -1803,214 +2210,624 @@ OPENSSL_GLOBAL SSL_CIPHER ssl3_ciphers[]={ 128, }, - /* Cipher C00D */ + /* Cipher C00D */ + { + 1, + TLS1_TXT_ECDH_RSA_WITH_DES_192_CBC3_SHA, + TLS1_CK_ECDH_RSA_WITH_DES_192_CBC3_SHA, + SSL_kECDHr, + SSL_aECDH, + SSL_3DES, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 168, + 168, + }, + + /* Cipher C00E */ + { + 1, + TLS1_TXT_ECDH_RSA_WITH_AES_128_CBC_SHA, + TLS1_CK_ECDH_RSA_WITH_AES_128_CBC_SHA, + SSL_kECDHr, + SSL_aECDH, + SSL_AES128, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 128, + 128, + }, + + /* Cipher C00F */ + { + 1, + TLS1_TXT_ECDH_RSA_WITH_AES_256_CBC_SHA, + TLS1_CK_ECDH_RSA_WITH_AES_256_CBC_SHA, + SSL_kECDHr, + SSL_aECDH, + SSL_AES256, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 256, + 256, + }, + + /* Cipher C010 */ + { + 1, + TLS1_TXT_ECDHE_RSA_WITH_NULL_SHA, + TLS1_CK_ECDHE_RSA_WITH_NULL_SHA, + SSL_kEECDH, + SSL_aRSA, + SSL_eNULL, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 0, + 0, + }, + + /* Cipher C011 */ + { + 1, + TLS1_TXT_ECDHE_RSA_WITH_RC4_128_SHA, + TLS1_CK_ECDHE_RSA_WITH_RC4_128_SHA, + SSL_kEECDH, + SSL_aRSA, + SSL_RC4, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_MEDIUM, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 128, + 128, + }, + + /* Cipher C012 */ + { + 1, + TLS1_TXT_ECDHE_RSA_WITH_DES_192_CBC3_SHA, + TLS1_CK_ECDHE_RSA_WITH_DES_192_CBC3_SHA, + SSL_kEECDH, + SSL_aRSA, + SSL_3DES, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 168, + 168, + }, + + /* Cipher C013 */ + { + 1, + TLS1_TXT_ECDHE_RSA_WITH_AES_128_CBC_SHA, + TLS1_CK_ECDHE_RSA_WITH_AES_128_CBC_SHA, + SSL_kEECDH, + SSL_aRSA, + SSL_AES128, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 128, + 128, + }, + + /* Cipher C014 */ + { + 1, + TLS1_TXT_ECDHE_RSA_WITH_AES_256_CBC_SHA, + TLS1_CK_ECDHE_RSA_WITH_AES_256_CBC_SHA, + SSL_kEECDH, + SSL_aRSA, + SSL_AES256, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 256, + 256, + }, + + /* Cipher C015 */ + { + 1, + TLS1_TXT_ECDH_anon_WITH_NULL_SHA, + TLS1_CK_ECDH_anon_WITH_NULL_SHA, + SSL_kEECDH, + SSL_aNULL, + SSL_eNULL, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_STRONG_NONE|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 0, + 0, + }, + + /* Cipher C016 */ + { + 1, + TLS1_TXT_ECDH_anon_WITH_RC4_128_SHA, + TLS1_CK_ECDH_anon_WITH_RC4_128_SHA, + SSL_kEECDH, + SSL_aNULL, + SSL_RC4, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_MEDIUM, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 128, + 128, + }, + + /* Cipher C017 */ + { + 1, + TLS1_TXT_ECDH_anon_WITH_DES_192_CBC3_SHA, + TLS1_CK_ECDH_anon_WITH_DES_192_CBC3_SHA, + SSL_kEECDH, + SSL_aNULL, + SSL_3DES, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 168, + 168, + }, + + /* Cipher C018 */ + { + 1, + TLS1_TXT_ECDH_anon_WITH_AES_128_CBC_SHA, + TLS1_CK_ECDH_anon_WITH_AES_128_CBC_SHA, + SSL_kEECDH, + SSL_aNULL, + SSL_AES128, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 128, + 128, + }, + + /* Cipher C019 */ + { + 1, + TLS1_TXT_ECDH_anon_WITH_AES_256_CBC_SHA, + TLS1_CK_ECDH_anon_WITH_AES_256_CBC_SHA, + SSL_kEECDH, + SSL_aNULL, + SSL_AES256, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 256, + 256, + }, +#endif /* OPENSSL_NO_ECDH */ + +#ifndef OPENSSL_NO_SRP + /* Cipher C01A */ + { + 1, + TLS1_TXT_SRP_SHA_WITH_3DES_EDE_CBC_SHA, + TLS1_CK_SRP_SHA_WITH_3DES_EDE_CBC_SHA, + SSL_kSRP, + SSL_aNULL, + SSL_3DES, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 168, + 168, + }, + + /* Cipher C01B */ + { + 1, + TLS1_TXT_SRP_SHA_RSA_WITH_3DES_EDE_CBC_SHA, + TLS1_CK_SRP_SHA_RSA_WITH_3DES_EDE_CBC_SHA, + SSL_kSRP, + SSL_aRSA, + SSL_3DES, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 168, + 168, + }, + + /* Cipher C01C */ + { + 1, + TLS1_TXT_SRP_SHA_DSS_WITH_3DES_EDE_CBC_SHA, + TLS1_CK_SRP_SHA_DSS_WITH_3DES_EDE_CBC_SHA, + SSL_kSRP, + SSL_aDSS, + SSL_3DES, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 168, + 168, + }, + + /* Cipher C01D */ + { + 1, + TLS1_TXT_SRP_SHA_WITH_AES_128_CBC_SHA, + TLS1_CK_SRP_SHA_WITH_AES_128_CBC_SHA, + SSL_kSRP, + SSL_aNULL, + SSL_AES128, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 128, + 128, + }, + + /* Cipher C01E */ + { + 1, + TLS1_TXT_SRP_SHA_RSA_WITH_AES_128_CBC_SHA, + TLS1_CK_SRP_SHA_RSA_WITH_AES_128_CBC_SHA, + SSL_kSRP, + SSL_aRSA, + SSL_AES128, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 128, + 128, + }, + + /* Cipher C01F */ + { + 1, + TLS1_TXT_SRP_SHA_DSS_WITH_AES_128_CBC_SHA, + TLS1_CK_SRP_SHA_DSS_WITH_AES_128_CBC_SHA, + SSL_kSRP, + SSL_aDSS, + SSL_AES128, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 128, + 128, + }, + + /* Cipher C020 */ + { + 1, + TLS1_TXT_SRP_SHA_WITH_AES_256_CBC_SHA, + TLS1_CK_SRP_SHA_WITH_AES_256_CBC_SHA, + SSL_kSRP, + SSL_aNULL, + SSL_AES256, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 256, + 256, + }, + + /* Cipher C021 */ + { + 1, + TLS1_TXT_SRP_SHA_RSA_WITH_AES_256_CBC_SHA, + TLS1_CK_SRP_SHA_RSA_WITH_AES_256_CBC_SHA, + SSL_kSRP, + SSL_aRSA, + SSL_AES256, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 256, + 256, + }, + + /* Cipher C022 */ + { + 1, + TLS1_TXT_SRP_SHA_DSS_WITH_AES_256_CBC_SHA, + TLS1_CK_SRP_SHA_DSS_WITH_AES_256_CBC_SHA, + SSL_kSRP, + SSL_aDSS, + SSL_AES256, + SSL_SHA1, + SSL_TLSV1, + SSL_NOT_EXP|SSL_HIGH, + SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + 256, + 256, + }, +#endif /* OPENSSL_NO_SRP */ +#ifndef OPENSSL_NO_ECDH + + /* HMAC based TLS v1.2 ciphersuites from RFC5289 */ + + /* Cipher C023 */ + { + 1, + TLS1_TXT_ECDHE_ECDSA_WITH_AES_128_SHA256, + TLS1_CK_ECDHE_ECDSA_WITH_AES_128_SHA256, + SSL_kEECDH, + SSL_aECDSA, + SSL_AES128, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256, + 128, + 128, + }, + + /* Cipher C024 */ { 1, - TLS1_TXT_ECDH_RSA_WITH_DES_192_CBC3_SHA, - TLS1_CK_ECDH_RSA_WITH_DES_192_CBC3_SHA, - SSL_kECDHr, - SSL_aECDH, - SSL_3DES, - SSL_SHA1, - SSL_TLSV1, - SSL_NOT_EXP|SSL_HIGH, - SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, - 168, - 168, + TLS1_TXT_ECDHE_ECDSA_WITH_AES_256_SHA384, + TLS1_CK_ECDHE_ECDSA_WITH_AES_256_SHA384, + SSL_kEECDH, + SSL_aECDSA, + SSL_AES256, + SSL_SHA384, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384, + 256, + 256, }, - /* Cipher C00E */ + /* Cipher C025 */ { 1, - TLS1_TXT_ECDH_RSA_WITH_AES_128_CBC_SHA, - TLS1_CK_ECDH_RSA_WITH_AES_128_CBC_SHA, - SSL_kECDHr, + TLS1_TXT_ECDH_ECDSA_WITH_AES_128_SHA256, + TLS1_CK_ECDH_ECDSA_WITH_AES_128_SHA256, + SSL_kECDHe, SSL_aECDH, SSL_AES128, - SSL_SHA1, - SSL_TLSV1, - SSL_NOT_EXP|SSL_HIGH, - SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256, 128, 128, }, - /* Cipher C00F */ + /* Cipher C026 */ { 1, - TLS1_TXT_ECDH_RSA_WITH_AES_256_CBC_SHA, - TLS1_CK_ECDH_RSA_WITH_AES_256_CBC_SHA, - SSL_kECDHr, + TLS1_TXT_ECDH_ECDSA_WITH_AES_256_SHA384, + TLS1_CK_ECDH_ECDSA_WITH_AES_256_SHA384, + SSL_kECDHe, SSL_aECDH, SSL_AES256, - SSL_SHA1, - SSL_TLSV1, - SSL_NOT_EXP|SSL_HIGH, - SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + SSL_SHA384, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384, 256, 256, }, - /* Cipher C010 */ + /* Cipher C027 */ { 1, - TLS1_TXT_ECDHE_RSA_WITH_NULL_SHA, - TLS1_CK_ECDHE_RSA_WITH_NULL_SHA, + TLS1_TXT_ECDHE_RSA_WITH_AES_128_SHA256, + TLS1_CK_ECDHE_RSA_WITH_AES_128_SHA256, SSL_kEECDH, SSL_aRSA, - SSL_eNULL, - SSL_SHA1, - SSL_TLSV1, - SSL_NOT_EXP|SSL_STRONG_NONE, - SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, - 0, - 0, + SSL_AES128, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256, + 128, + 128, }, - /* Cipher C011 */ + /* Cipher C028 */ { 1, - TLS1_TXT_ECDHE_RSA_WITH_RC4_128_SHA, - TLS1_CK_ECDHE_RSA_WITH_RC4_128_SHA, + TLS1_TXT_ECDHE_RSA_WITH_AES_256_SHA384, + TLS1_CK_ECDHE_RSA_WITH_AES_256_SHA384, SSL_kEECDH, SSL_aRSA, - SSL_RC4, - SSL_SHA1, - SSL_TLSV1, - SSL_NOT_EXP|SSL_MEDIUM, - SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + SSL_AES256, + SSL_SHA384, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384, + 256, + 256, + }, + + /* Cipher C029 */ + { + 1, + TLS1_TXT_ECDH_RSA_WITH_AES_128_SHA256, + TLS1_CK_ECDH_RSA_WITH_AES_128_SHA256, + SSL_kECDHe, + SSL_aECDH, + SSL_AES128, + SSL_SHA256, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256, 128, 128, }, - /* Cipher C012 */ + /* Cipher C02A */ { 1, - TLS1_TXT_ECDHE_RSA_WITH_DES_192_CBC3_SHA, - TLS1_CK_ECDHE_RSA_WITH_DES_192_CBC3_SHA, - SSL_kEECDH, - SSL_aRSA, - SSL_3DES, - SSL_SHA1, - SSL_TLSV1, - SSL_NOT_EXP|SSL_HIGH, - SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, - 168, - 168, + TLS1_TXT_ECDH_RSA_WITH_AES_256_SHA384, + TLS1_CK_ECDH_RSA_WITH_AES_256_SHA384, + SSL_kECDHe, + SSL_aECDH, + SSL_AES256, + SSL_SHA384, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384, + 256, + 256, }, - /* Cipher C013 */ + /* GCM based TLS v1.2 ciphersuites from RFC5289 */ + + /* Cipher C02B */ { 1, - TLS1_TXT_ECDHE_RSA_WITH_AES_128_CBC_SHA, - TLS1_CK_ECDHE_RSA_WITH_AES_128_CBC_SHA, + TLS1_TXT_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256, + TLS1_CK_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256, SSL_kEECDH, - SSL_aRSA, - SSL_AES128, - SSL_SHA1, - SSL_TLSV1, - SSL_NOT_EXP|SSL_HIGH, - SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + SSL_aECDSA, + SSL_AES128GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256, 128, 128, }, - /* Cipher C014 */ + /* Cipher C02C */ { 1, - TLS1_TXT_ECDHE_RSA_WITH_AES_256_CBC_SHA, - TLS1_CK_ECDHE_RSA_WITH_AES_256_CBC_SHA, + TLS1_TXT_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384, + TLS1_CK_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384, SSL_kEECDH, - SSL_aRSA, - SSL_AES256, - SSL_SHA1, - SSL_TLSV1, - SSL_NOT_EXP|SSL_HIGH, - SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + SSL_aECDSA, + SSL_AES256GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384, 256, 256, }, - /* Cipher C015 */ + /* Cipher C02D */ { 1, - TLS1_TXT_ECDH_anon_WITH_NULL_SHA, - TLS1_CK_ECDH_anon_WITH_NULL_SHA, - SSL_kEECDH, - SSL_aNULL, - SSL_eNULL, - SSL_SHA1, - SSL_TLSV1, - SSL_NOT_EXP|SSL_STRONG_NONE, - SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, - 0, - 0, + TLS1_TXT_ECDH_ECDSA_WITH_AES_128_GCM_SHA256, + TLS1_CK_ECDH_ECDSA_WITH_AES_128_GCM_SHA256, + SSL_kECDHe, + SSL_aECDH, + SSL_AES128GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256, + 128, + 128, }, - /* Cipher C016 */ + /* Cipher C02E */ { 1, - TLS1_TXT_ECDH_anon_WITH_RC4_128_SHA, - TLS1_CK_ECDH_anon_WITH_RC4_128_SHA, + TLS1_TXT_ECDH_ECDSA_WITH_AES_256_GCM_SHA384, + TLS1_CK_ECDH_ECDSA_WITH_AES_256_GCM_SHA384, + SSL_kECDHe, + SSL_aECDH, + SSL_AES256GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384, + 256, + 256, + }, + + /* Cipher C02F */ + { + 1, + TLS1_TXT_ECDHE_RSA_WITH_AES_128_GCM_SHA256, + TLS1_CK_ECDHE_RSA_WITH_AES_128_GCM_SHA256, SSL_kEECDH, - SSL_aNULL, - SSL_RC4, - SSL_SHA1, - SSL_TLSV1, - SSL_NOT_EXP|SSL_MEDIUM, - SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + SSL_aRSA, + SSL_AES128GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256, 128, 128, }, - /* Cipher C017 */ + /* Cipher C030 */ { 1, - TLS1_TXT_ECDH_anon_WITH_DES_192_CBC3_SHA, - TLS1_CK_ECDH_anon_WITH_DES_192_CBC3_SHA, + TLS1_TXT_ECDHE_RSA_WITH_AES_256_GCM_SHA384, + TLS1_CK_ECDHE_RSA_WITH_AES_256_GCM_SHA384, SSL_kEECDH, - SSL_aNULL, - SSL_3DES, - SSL_SHA1, - SSL_TLSV1, - SSL_NOT_EXP|SSL_HIGH, - SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, - 168, - 168, + SSL_aRSA, + SSL_AES256GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384, + 256, + 256, }, - /* Cipher C018 */ + /* Cipher C031 */ { 1, - TLS1_TXT_ECDH_anon_WITH_AES_128_CBC_SHA, - TLS1_CK_ECDH_anon_WITH_AES_128_CBC_SHA, - SSL_kEECDH, - SSL_aNULL, - SSL_AES128, - SSL_SHA1, - SSL_TLSV1, - SSL_NOT_EXP|SSL_HIGH, - SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256, + TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256, + SSL_kECDHe, + SSL_aECDH, + SSL_AES128GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA256|TLS1_PRF_SHA256, 128, 128, }, - /* Cipher C019 */ + /* Cipher C032 */ { 1, - TLS1_TXT_ECDH_anon_WITH_AES_256_CBC_SHA, - TLS1_CK_ECDH_anon_WITH_AES_256_CBC_SHA, - SSL_kEECDH, - SSL_aNULL, - SSL_AES256, - SSL_SHA1, - SSL_TLSV1, - SSL_NOT_EXP|SSL_HIGH, - SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF, + TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384, + TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384, + SSL_kECDHe, + SSL_aECDH, + SSL_AES256GCM, + SSL_AEAD, + SSL_TLSV1_2, + SSL_NOT_EXP|SSL_HIGH|SSL_FIPS, + SSL_HANDSHAKE_MAC_SHA384|TLS1_PRF_SHA384, 256, 256, }, -#endif /* OPENSSL_NO_ECDH */ + +#endif /* OPENSSL_NO_ECDH */ + #ifdef TEMP_GOST_TLS /* Cipher FF00 */ @@ -2087,6 +2904,9 @@ SSL3_ENC_METHOD SSLv3_enc_data={ SSL3_MD_CLIENT_FINISHED_CONST,4, SSL3_MD_SERVER_FINISHED_CONST,4, ssl3_alert_code, + (int (*)(SSL *, unsigned char *, size_t, const char *, + size_t, const unsigned char *, size_t, + int use_context))ssl_undefined_function, }; long ssl3_default_timeout(void) @@ -2128,6 +2948,9 @@ int ssl3_new(SSL *s) s->s3=s3; +#ifndef OPENSSL_NO_SRP + SSL_SRP_CTX_init(s); +#endif s->method->ssl_clear(s); return(1); err: @@ -2168,6 +2991,9 @@ void ssl3_free(SSL *s) BIO_free(s->s3->handshake_buffer); } if (s->s3->handshake_dgst) ssl3_free_digest_list(s); +#ifndef OPENSSL_NO_SRP + SSL_SRP_CTX_free(s); +#endif OPENSSL_cleanse(s->s3,sizeof *s->s3); OPENSSL_free(s->s3); s->s3=NULL; @@ -2239,7 +3065,23 @@ void ssl3_clear(SSL *s) s->s3->num_renegotiations=0; s->s3->in_read_app_data=0; s->version=SSL3_VERSION; + +#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG) + if (s->next_proto_negotiated) + { + OPENSSL_free(s->next_proto_negotiated); + s->next_proto_negotiated = NULL; + s->next_proto_negotiated_len = 0; + } +#endif + } + +#ifndef OPENSSL_NO_SRP +static char * MS_CALLBACK srp_password_from_info_cb(SSL *s, void *arg) + { + return BUF_strdup(s->srp_ctx.info) ; } +#endif long ssl3_ctrl(SSL *s, int cmd, long larg, void *parg) { @@ -2486,6 +3328,27 @@ long ssl3_ctrl(SSL *s, int cmd, long larg, void *parg) ret = 1; break; +#ifndef OPENSSL_NO_HEARTBEATS + case SSL_CTRL_TLS_EXT_SEND_HEARTBEAT: + if (SSL_version(s) == DTLS1_VERSION || SSL_version(s) == DTLS1_BAD_VER) + ret = dtls1_heartbeat(s); + else + ret = tls1_heartbeat(s); + break; + + case SSL_CTRL_GET_TLS_EXT_HEARTBEAT_PENDING: + ret = s->tlsext_hb_pending; + break; + + case SSL_CTRL_SET_TLS_EXT_HEARTBEAT_NO_REQUESTS: + if (larg) + s->tlsext_heartbeat |= SSL_TLSEXT_HB_DONT_RECV_REQUESTS; + else + s->tlsext_heartbeat &= ~SSL_TLSEXT_HB_DONT_RECV_REQUESTS; + ret = 1; + break; +#endif + #endif /* !OPENSSL_NO_TLSEXT */ default: break; @@ -2718,6 +3581,38 @@ long ssl3_ctx_ctrl(SSL_CTX *ctx, int cmd, long larg, void *parg) return 1; break; +#ifndef OPENSSL_NO_SRP + case SSL_CTRL_SET_TLS_EXT_SRP_USERNAME: + ctx->srp_ctx.srp_Mask|=SSL_kSRP; + if (ctx->srp_ctx.login != NULL) + OPENSSL_free(ctx->srp_ctx.login); + ctx->srp_ctx.login = NULL; + if (parg == NULL) + break; + if (strlen((const char *)parg) > 255 || strlen((const char *)parg) < 1) + { + SSLerr(SSL_F_SSL3_CTX_CTRL, SSL_R_INVALID_SRP_USERNAME); + return 0; + } + if ((ctx->srp_ctx.login = BUF_strdup((char *)parg)) == NULL) + { + SSLerr(SSL_F_SSL3_CTX_CTRL, ERR_R_INTERNAL_ERROR); + return 0; + } + break; + case SSL_CTRL_SET_TLS_EXT_SRP_PASSWORD: + ctx->srp_ctx.SRP_give_srp_client_pwd_callback=srp_password_from_info_cb; + ctx->srp_ctx.info=parg; + break; + case SSL_CTRL_SET_SRP_ARG: + ctx->srp_ctx.srp_Mask|=SSL_kSRP; + ctx->srp_ctx.SRP_cb_arg=parg; + break; + + case SSL_CTRL_SET_TLS_EXT_SRP_STRENGTH: + ctx->srp_ctx.strength=larg; + break; +#endif #endif /* !OPENSSL_NO_TLSEXT */ /* A Thawte special :-) */ @@ -2730,6 +3625,18 @@ long ssl3_ctx_ctrl(SSL_CTX *ctx, int cmd, long larg, void *parg) sk_X509_push(ctx->extra_certs,(X509 *)parg); break; + case SSL_CTRL_GET_EXTRA_CHAIN_CERTS: + *(STACK_OF(X509) **)parg = ctx->extra_certs; + break; + + case SSL_CTRL_CLEAR_EXTRA_CHAIN_CERTS: + if (ctx->extra_certs) + { + sk_X509_pop_free(ctx->extra_certs, X509_free); + ctx->extra_certs = NULL; + } + break; + default: return(0); } @@ -2787,6 +3694,20 @@ long ssl3_ctx_callback_ctrl(SSL_CTX *ctx, int cmd, void (*fp)(void)) HMAC_CTX *, int))fp; break; +#ifndef OPENSSL_NO_SRP + case SSL_CTRL_SET_SRP_VERIFY_PARAM_CB: + ctx->srp_ctx.srp_Mask|=SSL_kSRP; + ctx->srp_ctx.SRP_verify_param_callback=(int (*)(SSL *,void *))fp; + break; + case SSL_CTRL_SET_TLS_EXT_SRP_USERNAME_CB: + ctx->srp_ctx.srp_Mask|=SSL_kSRP; + ctx->srp_ctx.TLS_ext_srp_username_callback=(int (*)(SSL *,int *,void *))fp; + break; + case SSL_CTRL_SET_SRP_GIVE_CLIENT_PWD_CB: + ctx->srp_ctx.srp_Mask|=SSL_kSRP; + ctx->srp_ctx.SRP_give_srp_client_pwd_callback=(char *(*)(SSL *,void *))fp; + break; +#endif #endif default: return(0); @@ -2805,6 +3726,9 @@ const SSL_CIPHER *ssl3_get_cipher_by_char(const unsigned char *p) id=0x03000000L|((unsigned long)p[0]<<8L)|(unsigned long)p[1]; c.id=id; cp = OBJ_bsearch_ssl_cipher_id(&c, ssl3_ciphers, SSL3_NUM_CIPHERS); +#ifdef DEBUG_PRINT_UNKNOWN_CIPHERSUITES +if (cp == NULL) fprintf(stderr, "Unknown cipher ID %x\n", (p[0] << 8) | p[1]); +#endif if (cp == NULL || cp->valid == 0) return NULL; else @@ -2882,11 +3806,20 @@ SSL_CIPHER *ssl3_choose_cipher(SSL *s, STACK_OF(SSL_CIPHER) *clnt, { c=sk_SSL_CIPHER_value(prio,i); + /* Skip TLS v1.2 only ciphersuites if lower than v1.2 */ + if ((c->algorithm_ssl & SSL_TLSV1_2) && + (TLS1_get_version(s) < TLS1_2_VERSION)) + continue; + ssl_set_cert_masks(cert,c); mask_k = cert->mask_k; mask_a = cert->mask_a; emask_k = cert->export_mask_k; emask_a = cert->export_mask_a; +#ifndef OPENSSL_NO_SRP + mask_k=cert->mask_k | s->srp_ctx.srp_Mask; + emask_k=cert->export_mask_k | s->srp_ctx.srp_Mask; +#endif #ifdef KSSL_DEBUG /* printf("ssl3_choose_cipher %d alg= %lx\n", i,c->algorithms);*/ @@ -3335,4 +4268,15 @@ need to go to SSL_ST_ACCEPT. } return(ret); } - +/* If we are using TLS v1.2 or later and default SHA1+MD5 algorithms switch + * to new SHA256 PRF and handshake macs + */ +long ssl_get_algorithm2(SSL *s) + { + long alg2 = s->s3->tmp.new_cipher->algorithm2; + if (TLS1_get_version(s) >= TLS1_2_VERSION && + alg2 == (SSL_HANDSHAKE_MAC_DEFAULT|TLS1_PRF)) + return SSL_HANDSHAKE_MAC_SHA256 | TLS1_PRF_SHA256; + return alg2; + } + diff --git a/src/lib/libssl/s3_pkt.c b/src/lib/libssl/s3_pkt.c index f9b3629cf7..adf8c387cc 100644 --- a/src/lib/libssl/s3_pkt.c +++ b/src/lib/libssl/s3_pkt.c @@ -115,6 +115,7 @@ #include "ssl_locl.h" #include #include +#include static int do_ssl3_write(SSL *s, int type, const unsigned char *buf, unsigned int len, int create_empty_fragment); @@ -630,6 +631,7 @@ static int do_ssl3_write(SSL *s, int type, const unsigned char *buf, unsigned char *p,*plen; int i,mac_size,clear=0; int prefix_len=0; + int eivlen; long align=0; SSL3_RECORD *wr; SSL3_BUFFER *wb=&(s->s3->wbuf); @@ -662,10 +664,14 @@ static int do_ssl3_write(SSL *s, int type, const unsigned char *buf, if ( (sess == NULL) || (s->enc_write_ctx == NULL) || (EVP_MD_CTX_md(s->write_hash) == NULL)) + { +#if 1 + clear=s->enc_write_ctx?0:1; /* must be AEAD cipher */ +#else clear=1; - - if (clear) +#endif mac_size=0; + } else { mac_size=EVP_MD_CTX_size(s->write_hash); @@ -734,14 +740,39 @@ static int do_ssl3_write(SSL *s, int type, const unsigned char *buf, wr->type=type; *(p++)=(s->version>>8); - *(p++)=s->version&0xff; + /* Some servers hang if iniatial client hello is larger than 256 + * bytes and record version number > TLS 1.0 + */ + if (s->state == SSL3_ST_CW_CLNT_HELLO_B + && TLS1_get_version(s) > TLS1_VERSION) + *(p++) = 0x1; + else + *(p++)=s->version&0xff; /* field where we are to write out packet length */ plen=p; p+=2; + /* Explicit IV length, block ciphers and TLS version 1.1 or later */ + if (s->enc_write_ctx && s->version >= TLS1_1_VERSION) + { + int mode = EVP_CIPHER_CTX_mode(s->enc_write_ctx); + if (mode == EVP_CIPH_CBC_MODE) + { + eivlen = EVP_CIPHER_CTX_iv_length(s->enc_write_ctx); + if (eivlen <= 1) + eivlen = 0; + } + /* Need explicit part of IV for GCM mode */ + else if (mode == EVP_CIPH_GCM_MODE) + eivlen = EVP_GCM_TLS_EXPLICIT_IV_LEN; + else + eivlen = 0; + } + else + eivlen = 0; /* lets setup the record stuff. */ - wr->data=p; + wr->data=p + eivlen; wr->length=(int)len; wr->input=(unsigned char *)buf; @@ -769,11 +800,19 @@ static int do_ssl3_write(SSL *s, int type, const unsigned char *buf, if (mac_size != 0) { - if (s->method->ssl3_enc->mac(s,&(p[wr->length]),1) < 0) + if (s->method->ssl3_enc->mac(s,&(p[wr->length + eivlen]),1) < 0) goto err; wr->length+=mac_size; - wr->input=p; - wr->data=p; + } + + wr->input=p; + wr->data=p; + + if (eivlen) + { + /* if (RAND_pseudo_bytes(p, eivlen) <= 0) + goto err; */ + wr->length += eivlen; } /* ssl3_enc can only have an error on read */ @@ -1042,6 +1081,19 @@ start: dest = s->s3->alert_fragment; dest_len = &s->s3->alert_fragment_len; } +#ifndef OPENSSL_NO_HEARTBEATS + else if (rr->type == TLS1_RT_HEARTBEAT) + { + tls1_process_heartbeat(s); + + /* Exit and notify application to read again */ + rr->length = 0; + s->rwstate=SSL_READING; + BIO_clear_retry_flags(SSL_get_rbio(s)); + BIO_set_retry_read(SSL_get_rbio(s)); + return(-1); + } +#endif if (dest_maxlen > 0) { @@ -1185,6 +1237,10 @@ start: SSLerr(SSL_F_SSL3_READ_BYTES,SSL_R_NO_RENEGOTIATION); goto f_err; } +#ifdef SSL_AD_MISSING_SRP_USERNAME + if (alert_descr == SSL_AD_MISSING_SRP_USERNAME) + return(0); +#endif } else if (alert_level == 2) /* fatal */ { @@ -1263,6 +1319,7 @@ start: #else s->state = s->server ? SSL_ST_ACCEPT : SSL_ST_CONNECT; #endif + s->renegotiate=1; s->new_session=1; } i=s->handshake_func(s); @@ -1296,8 +1353,10 @@ start: { default: #ifndef OPENSSL_NO_TLS - /* TLS just ignores unknown message types */ - if (s->version == TLS1_VERSION) + /* TLS up to v1.1 just ignores unknown message types: + * TLS v1.2 give an unexpected message alert. + */ + if (s->version >= TLS1_VERSION && s->version <= TLS1_1_VERSION) { rr->length = 0; goto start; diff --git a/src/lib/libssl/s3_srvr.c b/src/lib/libssl/s3_srvr.c index d734c359fb..118939fabb 100644 --- a/src/lib/libssl/s3_srvr.c +++ b/src/lib/libssl/s3_srvr.c @@ -179,6 +179,31 @@ static const SSL_METHOD *ssl3_get_server_method(int ver) return(NULL); } +#ifndef OPENSSL_NO_SRP +static int ssl_check_srp_ext_ClientHello(SSL *s, int *al) + { + int ret = SSL_ERROR_NONE; + + *al = SSL_AD_UNRECOGNIZED_NAME; + + if ((s->s3->tmp.new_cipher->algorithm_mkey & SSL_kSRP) && + (s->srp_ctx.TLS_ext_srp_username_callback != NULL)) + { + if(s->srp_ctx.login == NULL) + { + /* There isn't any srp login extension !!! */ + ret = SSL3_AL_FATAL; + *al = SSL_AD_UNKNOWN_PSK_IDENTITY; + } + else + { + ret = SSL_srp_server_param_with_username(s,al); + } + } + return ret; + } +#endif + IMPLEMENT_ssl3_meth_func(SSLv3_server_method, ssl3_accept, ssl_undefined_function, @@ -211,6 +236,18 @@ int ssl3_accept(SSL *s) return(-1); } +#ifndef OPENSSL_NO_HEARTBEATS + /* If we're awaiting a HeartbeatResponse, pretend we + * already got and don't await it anymore, because + * Heartbeats don't make sense during handshakes anyway. + */ + if (s->tlsext_hb_pending) + { + s->tlsext_hb_pending = 0; + s->tlsext_hb_seq++; + } +#endif + for (;;) { state=s->state; @@ -218,7 +255,7 @@ int ssl3_accept(SSL *s) switch (s->state) { case SSL_ST_RENEGOTIATE: - s->new_session=1; + s->renegotiate=1; /* s->state=SSL_ST_ACCEPT; */ case SSL_ST_BEFORE: @@ -314,10 +351,34 @@ int ssl3_accept(SSL *s) case SSL3_ST_SR_CLNT_HELLO_C: s->shutdown=0; - ret=ssl3_get_client_hello(s); - if (ret <= 0) goto end; - - s->new_session = 2; + if (s->rwstate != SSL_X509_LOOKUP) + { + ret=ssl3_get_client_hello(s); + if (ret <= 0) goto end; + } +#ifndef OPENSSL_NO_SRP + { + int al; + if ((ret = ssl_check_srp_ext_ClientHello(s,&al)) < 0) + { + /* callback indicates firther work to be done */ + s->rwstate=SSL_X509_LOOKUP; + goto end; + } + if (ret != SSL_ERROR_NONE) + { + ssl3_send_alert(s,SSL3_AL_FATAL,al); + /* This is not really an error but the only means to + for a client to detect whether srp is supported. */ + if (al != TLS1_AD_UNKNOWN_PSK_IDENTITY) + SSLerr(SSL_F_SSL3_ACCEPT,SSL_R_CLIENTHELLO_TLSEXT); + ret = SSL_TLSEXT_ERR_ALERT_FATAL; + ret= -1; + goto end; + } + } +#endif + s->renegotiate = 2; s->state=SSL3_ST_SW_SRVR_HELLO_A; s->init_num=0; break; @@ -346,7 +407,7 @@ int ssl3_accept(SSL *s) case SSL3_ST_SW_CERT_A: case SSL3_ST_SW_CERT_B: /* Check if it is anon DH or anon ECDH, */ - /* normal PSK or KRB5 */ + /* normal PSK or KRB5 or SRP */ if (!(s->s3->tmp.new_cipher->algorithm_auth & SSL_aNULL) && !(s->s3->tmp.new_cipher->algorithm_mkey & SSL_kPSK) && !(s->s3->tmp.new_cipher->algorithm_auth & SSL_aKRB5)) @@ -410,6 +471,10 @@ int ssl3_accept(SSL *s) * hint if provided */ #ifndef OPENSSL_NO_PSK || ((alg_k & SSL_kPSK) && s->ctx->psk_identity_hint) +#endif +#ifndef OPENSSL_NO_SRP + /* SRP: send ServerKeyExchange */ + || (alg_k & SSL_kSRP) #endif || (alg_k & (SSL_kDHr|SSL_kDHd|SSL_kEDH)) || (alg_k & SSL_kEECDH) @@ -457,6 +522,9 @@ int ssl3_accept(SSL *s) skip=1; s->s3->tmp.cert_request=0; s->state=SSL3_ST_SW_SRVR_DONE_A; + if (s->s3->handshake_buffer) + if (!ssl3_digest_cached_records(s)) + return -1; } else { @@ -539,9 +607,34 @@ int ssl3_accept(SSL *s) * the client uses its key from the certificate * for key exchange. */ +#if defined(OPENSSL_NO_TLSEXT) || defined(OPENSSL_NO_NEXTPROTONEG) s->state=SSL3_ST_SR_FINISHED_A; +#else + if (s->s3->next_proto_neg_seen) + s->state=SSL3_ST_SR_NEXT_PROTO_A; + else + s->state=SSL3_ST_SR_FINISHED_A; +#endif s->init_num = 0; } + else if (TLS1_get_version(s) >= TLS1_2_VERSION) + { + s->state=SSL3_ST_SR_CERT_VRFY_A; + s->init_num=0; + if (!s->session->peer) + break; + /* For TLS v1.2 freeze the handshake buffer + * at this point and digest cached records. + */ + if (!s->s3->handshake_buffer) + { + SSLerr(SSL_F_SSL3_ACCEPT,ERR_R_INTERNAL_ERROR); + return -1; + } + s->s3->flags |= TLS1_FLAGS_KEEP_HANDSHAKE; + if (!ssl3_digest_cached_records(s)) + return -1; + } else { int offset=0; @@ -582,23 +675,37 @@ int ssl3_accept(SSL *s) ret=ssl3_get_cert_verify(s); if (ret <= 0) goto end; +#if defined(OPENSSL_NO_TLSEXT) || defined(OPENSSL_NO_NEXTPROTONEG) s->state=SSL3_ST_SR_FINISHED_A; +#else + if (s->s3->next_proto_neg_seen) + s->state=SSL3_ST_SR_NEXT_PROTO_A; + else + s->state=SSL3_ST_SR_FINISHED_A; +#endif s->init_num=0; break; +#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG) + case SSL3_ST_SR_NEXT_PROTO_A: + case SSL3_ST_SR_NEXT_PROTO_B: + ret=ssl3_get_next_proto(s); + if (ret <= 0) goto end; + s->init_num = 0; + s->state=SSL3_ST_SR_FINISHED_A; + break; +#endif + case SSL3_ST_SR_FINISHED_A: case SSL3_ST_SR_FINISHED_B: ret=ssl3_get_finished(s,SSL3_ST_SR_FINISHED_A, SSL3_ST_SR_FINISHED_B); if (ret <= 0) goto end; -#ifndef OPENSSL_NO_TLSEXT - if (s->tlsext_ticket_expected) - s->state=SSL3_ST_SW_SESSION_TICKET_A; - else if (s->hit) - s->state=SSL_ST_OK; -#else if (s->hit) s->state=SSL_ST_OK; +#ifndef OPENSSL_NO_TLSEXT + else if (s->tlsext_ticket_expected) + s->state=SSL3_ST_SW_SESSION_TICKET_A; #endif else s->state=SSL3_ST_SW_CHANGE_A; @@ -656,7 +763,16 @@ int ssl3_accept(SSL *s) if (ret <= 0) goto end; s->state=SSL3_ST_SW_FLUSH; if (s->hit) + { +#if defined(OPENSSL_NO_TLSEXT) || defined(OPENSSL_NO_NEXTPROTONEG) s->s3->tmp.next_state=SSL3_ST_SR_FINISHED_A; +#else + if (s->s3->next_proto_neg_seen) + s->s3->tmp.next_state=SSL3_ST_SR_NEXT_PROTO_A; + else + s->s3->tmp.next_state=SSL3_ST_SR_FINISHED_A; +#endif + } else s->s3->tmp.next_state=SSL_ST_OK; s->init_num=0; @@ -674,11 +790,9 @@ int ssl3_accept(SSL *s) s->init_num=0; - if (s->new_session == 2) /* skipped if we just sent a HelloRequest */ + if (s->renegotiate == 2) /* skipped if we just sent a HelloRequest */ { - /* actually not necessarily a 'new' session unless - * SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION is set */ - + s->renegotiate=0; s->new_session=0; ssl_update_cache(s,SSL_SESS_CACHE_SERVER); @@ -756,14 +870,6 @@ int ssl3_check_client_hello(SSL *s) int ok; long n; - /* We only allow the client to restart the handshake once per - * negotiation. */ - if (s->s3->flags & SSL3_FLAGS_SGC_RESTART_DONE) - { - SSLerr(SSL_F_SSL3_CHECK_CLIENT_HELLO, SSL_R_MULTIPLE_SGC_RESTARTS); - return -1; - } - /* this function is called when we really expect a Certificate message, * so permit appropriate message length */ n=s->method->ssl_get_message(s, @@ -776,6 +882,13 @@ int ssl3_check_client_hello(SSL *s) s->s3->tmp.reuse_message = 1; if (s->s3->tmp.message_type == SSL3_MT_CLIENT_HELLO) { + /* We only allow the client to restart the handshake once per + * negotiation. */ + if (s->s3->flags & SSL3_FLAGS_SGC_RESTART_DONE) + { + SSLerr(SSL_F_SSL3_CHECK_CLIENT_HELLO, SSL_R_MULTIPLE_SGC_RESTARTS); + return -1; + } /* Throw away what we have done so far in the current handshake, * which will now be aborted. (A full SSL_clear would be too much.) */ #ifndef OPENSSL_NO_DH @@ -817,7 +930,8 @@ int ssl3_get_client_hello(SSL *s) * If we are SSLv3, we will respond with SSLv3, even if prompted with * TLSv1. */ - if (s->state == SSL3_ST_SR_CLNT_HELLO_A) + if (s->state == SSL3_ST_SR_CLNT_HELLO_A + ) { s->state=SSL3_ST_SR_CLNT_HELLO_B; } @@ -874,13 +988,16 @@ int ssl3_get_client_hello(SSL *s) j= *(p++); s->hit=0; - /* Versions before 0.9.7 always allow session reuse during renegotiation - * (i.e. when s->new_session is true), option - * SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION is new with 0.9.7. - * Maybe this optional behaviour should always have been the default, - * but we cannot safely change the default behaviour (or new applications - * might be written that become totally unsecure when compiled with - * an earlier library version) + /* Versions before 0.9.7 always allow clients to resume sessions in renegotiation. + * 0.9.7 and later allow this by default, but optionally ignore resumption requests + * with flag SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION (it's a new flag rather + * than a change to default behavior so that applications relying on this for security + * won't even compile against older library versions). + * + * 1.0.1 and later also have a function SSL_renegotiate_abbreviated() to request + * renegotiation but not a new session (s->new_session remains unset): for servers, + * this essentially just means that the SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION + * setting will be ignored. */ if ((s->new_session && (s->options & SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION))) { @@ -1269,8 +1386,11 @@ int ssl3_get_client_hello(SSL *s) s->s3->tmp.new_cipher=s->session->cipher; } - if (!ssl3_digest_cached_records(s)) - goto f_err; + if (TLS1_get_version(s) < TLS1_2_VERSION || !(s->verify_mode & SSL_VERIFY_PEER)) + { + if (!ssl3_digest_cached_records(s)) + goto f_err; + } /* we now have the following setup. * client_random @@ -1325,20 +1445,20 @@ int ssl3_send_server_hello(SSL *s) memcpy(p,s->s3->server_random,SSL3_RANDOM_SIZE); p+=SSL3_RANDOM_SIZE; - /* now in theory we have 3 options to sending back the - * session id. If it is a re-use, we send back the - * old session-id, if it is a new session, we send - * back the new session-id or we send back a 0 length - * session-id if we want it to be single use. - * Currently I will not implement the '0' length session-id - * 12-Jan-98 - I'll now support the '0' length stuff. - * - * We also have an additional case where stateless session - * resumption is successful: we always send back the old - * session id. In this case s->hit is non zero: this can - * only happen if stateless session resumption is succesful - * if session caching is disabled so existing functionality - * is unaffected. + /* There are several cases for the session ID to send + * back in the server hello: + * - For session reuse from the session cache, + * we send back the old session ID. + * - If stateless session reuse (using a session ticket) + * is successful, we send back the client's "session ID" + * (which doesn't actually identify the session). + * - If it is a new session, we send back the new + * session ID. + * - However, if we want the new session to be single-use, + * we send back a 0-length session ID. + * s->hit is non-zero in either case of session reuse, + * so the following won't overwrite an ID that we're supposed + * to send back. */ if (!(s->ctx->session_cache_mode & SSL_SESS_CACHE_SERVER) && !s->hit) @@ -1439,6 +1559,7 @@ int ssl3_send_server_key_exchange(SSL *s) BN_CTX *bn_ctx = NULL; #endif EVP_PKEY *pkey; + const EVP_MD *md = NULL; unsigned char *p,*d; int al,i; unsigned long type; @@ -1679,21 +1800,44 @@ int ssl3_send_server_key_exchange(SSL *s) } else #endif /* !OPENSSL_NO_PSK */ +#ifndef OPENSSL_NO_SRP + if (type & SSL_kSRP) + { + if ((s->srp_ctx.N == NULL) || + (s->srp_ctx.g == NULL) || + (s->srp_ctx.s == NULL) || + (s->srp_ctx.B == NULL)) + { + SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,SSL_R_MISSING_SRP_PARAM); + goto err; + } + r[0]=s->srp_ctx.N; + r[1]=s->srp_ctx.g; + r[2]=s->srp_ctx.s; + r[3]=s->srp_ctx.B; + } + else +#endif { al=SSL_AD_HANDSHAKE_FAILURE; SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,SSL_R_UNKNOWN_KEY_EXCHANGE_TYPE); goto f_err; } - for (i=0; r[i] != NULL; i++) + for (i=0; r[i] != NULL && i<4; i++) { nr[i]=BN_num_bytes(r[i]); +#ifndef OPENSSL_NO_SRP + if ((i == 2) && (type & SSL_kSRP)) + n+=1+nr[i]; + else +#endif n+=2+nr[i]; } if (!(s->s3->tmp.new_cipher->algorithm_auth & SSL_aNULL) && !(s->s3->tmp.new_cipher->algorithm_mkey & SSL_kPSK)) { - if ((pkey=ssl_get_sign_pkey(s,s->s3->tmp.new_cipher)) + if ((pkey=ssl_get_sign_pkey(s,s->s3->tmp.new_cipher,&md)) == NULL) { al=SSL_AD_DECODE_ERROR; @@ -1715,8 +1859,16 @@ int ssl3_send_server_key_exchange(SSL *s) d=(unsigned char *)s->init_buf->data; p= &(d[4]); - for (i=0; r[i] != NULL; i++) + for (i=0; r[i] != NULL && i<4; i++) { +#ifndef OPENSSL_NO_SRP + if ((i == 2) && (type & SSL_kSRP)) + { + *p = nr[i]; + p++; + } + else +#endif s2n(nr[i],p); BN_bn2bin(r[i],p); p+=nr[i]; @@ -1764,12 +1916,15 @@ int ssl3_send_server_key_exchange(SSL *s) /* n is the length of the params, they start at &(d[4]) * and p points to the space at the end. */ #ifndef OPENSSL_NO_RSA - if (pkey->type == EVP_PKEY_RSA) + if (pkey->type == EVP_PKEY_RSA + && TLS1_get_version(s) < TLS1_2_VERSION) { q=md_buf; j=0; for (num=2; num > 0; num--) { + EVP_MD_CTX_set_flags(&md_ctx, + EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); EVP_DigestInit_ex(&md_ctx,(num == 2) ?s->ctx->md5:s->ctx->sha1, NULL); EVP_DigestUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE); @@ -1791,44 +1946,41 @@ int ssl3_send_server_key_exchange(SSL *s) } else #endif -#if !defined(OPENSSL_NO_DSA) - if (pkey->type == EVP_PKEY_DSA) + if (md) { - /* lets do DSS */ - EVP_SignInit_ex(&md_ctx,EVP_dss1(), NULL); - EVP_SignUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE); - EVP_SignUpdate(&md_ctx,&(s->s3->server_random[0]),SSL3_RANDOM_SIZE); - EVP_SignUpdate(&md_ctx,&(d[4]),n); - if (!EVP_SignFinal(&md_ctx,&(p[2]), - (unsigned int *)&i,pkey)) + /* For TLS1.2 and later send signature + * algorithm */ + if (TLS1_get_version(s) >= TLS1_2_VERSION) { - SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,ERR_LIB_DSA); - goto err; + if (!tls12_get_sigandhash(p, pkey, md)) + { + /* Should never happen */ + al=SSL_AD_INTERNAL_ERROR; + SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR); + goto f_err; + } + p+=2; } - s2n(i,p); - n+=i+2; - } - else +#ifdef SSL_DEBUG + fprintf(stderr, "Using hash %s\n", + EVP_MD_name(md)); #endif -#if !defined(OPENSSL_NO_ECDSA) - if (pkey->type == EVP_PKEY_EC) - { - /* let's do ECDSA */ - EVP_SignInit_ex(&md_ctx,EVP_ecdsa(), NULL); + EVP_SignInit_ex(&md_ctx, md, NULL); EVP_SignUpdate(&md_ctx,&(s->s3->client_random[0]),SSL3_RANDOM_SIZE); EVP_SignUpdate(&md_ctx,&(s->s3->server_random[0]),SSL3_RANDOM_SIZE); EVP_SignUpdate(&md_ctx,&(d[4]),n); if (!EVP_SignFinal(&md_ctx,&(p[2]), (unsigned int *)&i,pkey)) { - SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,ERR_LIB_ECDSA); + SSLerr(SSL_F_SSL3_SEND_SERVER_KEY_EXCHANGE,ERR_LIB_EVP); goto err; } s2n(i,p); n+=i+2; + if (TLS1_get_version(s) >= TLS1_2_VERSION) + n+= 2; } else -#endif { /* Is this error check actually needed? */ al=SSL_AD_HANDSHAKE_FAILURE; @@ -1881,6 +2033,14 @@ int ssl3_send_certificate_request(SSL *s) p+=n; n++; + if (TLS1_get_version(s) >= TLS1_2_VERSION) + { + nl = tls12_get_req_sig_algs(s, p + 2); + s2n(nl, p); + p += nl + 2; + n += nl + 2; + } + off=n; p+=2; n+=2; @@ -2600,6 +2760,44 @@ int ssl3_get_client_key_exchange(SSL *s) } else #endif +#ifndef OPENSSL_NO_SRP + if (alg_k & SSL_kSRP) + { + int param_len; + + n2s(p,i); + param_len=i+2; + if (param_len > n) + { + al=SSL_AD_DECODE_ERROR; + SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,SSL_R_BAD_SRP_A_LENGTH); + goto f_err; + } + if (!(s->srp_ctx.A=BN_bin2bn(p,i,NULL))) + { + SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,ERR_R_BN_LIB); + goto err; + } + if (s->session->srp_username != NULL) + OPENSSL_free(s->session->srp_username); + s->session->srp_username = BUF_strdup(s->srp_ctx.login); + if (s->session->srp_username == NULL) + { + SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE, + ERR_R_MALLOC_FAILURE); + goto err; + } + + if ((s->session->master_key_length = SRP_generate_server_master_secret(s,s->session->master_key))<0) + { + SSLerr(SSL_F_SSL3_GET_CLIENT_KEY_EXCHANGE,ERR_R_INTERNAL_ERROR); + goto err; + } + + p+=i; + } + else +#endif /* OPENSSL_NO_SRP */ if (alg_k & SSL_kGOST) { int ret = 0; @@ -2683,7 +2881,7 @@ int ssl3_get_client_key_exchange(SSL *s) return(1); f_err: ssl3_send_alert(s,SSL3_AL_FATAL,al); -#if !defined(OPENSSL_NO_DH) || !defined(OPENSSL_NO_RSA) || !defined(OPENSSL_NO_ECDH) +#if !defined(OPENSSL_NO_DH) || !defined(OPENSSL_NO_RSA) || !defined(OPENSSL_NO_ECDH) || defined(OPENSSL_NO_SRP) err: #endif #ifndef OPENSSL_NO_ECDH @@ -2704,12 +2902,15 @@ int ssl3_get_cert_verify(SSL *s) long n; int type=0,i,j; X509 *peer; + const EVP_MD *md = NULL; + EVP_MD_CTX mctx; + EVP_MD_CTX_init(&mctx); n=s->method->ssl_get_message(s, SSL3_ST_SR_CERT_VRFY_A, SSL3_ST_SR_CERT_VRFY_B, -1, - 514, /* 514? */ + 516, /* Enough for 4096 bit RSA key with TLS v1.2 */ &ok); if (!ok) return((int)n); @@ -2729,7 +2930,7 @@ int ssl3_get_cert_verify(SSL *s) if (s->s3->tmp.message_type != SSL3_MT_CERTIFICATE_VERIFY) { s->s3->tmp.reuse_message=1; - if ((peer != NULL) && (type | EVP_PKT_SIGN)) + if ((peer != NULL) && (type & EVP_PKT_SIGN)) { al=SSL_AD_UNEXPECTED_MESSAGE; SSLerr(SSL_F_SSL3_GET_CERT_VERIFY,SSL_R_MISSING_VERIFY_MESSAGE); @@ -2772,6 +2973,36 @@ int ssl3_get_cert_verify(SSL *s) } else { + if (TLS1_get_version(s) >= TLS1_2_VERSION) + { + int sigalg = tls12_get_sigid(pkey); + /* Should never happen */ + if (sigalg == -1) + { + SSLerr(SSL_F_SSL3_GET_CERT_VERIFY,ERR_R_INTERNAL_ERROR); + al=SSL_AD_INTERNAL_ERROR; + goto f_err; + } + /* Check key type is consistent with signature */ + if (sigalg != (int)p[1]) + { + SSLerr(SSL_F_SSL3_GET_CERT_VERIFY,SSL_R_WRONG_SIGNATURE_TYPE); + al=SSL_AD_DECODE_ERROR; + goto f_err; + } + md = tls12_get_hash(p[0]); + if (md == NULL) + { + SSLerr(SSL_F_SSL3_GET_CERT_VERIFY,SSL_R_UNKNOWN_DIGEST); + al=SSL_AD_DECODE_ERROR; + goto f_err; + } +#ifdef SSL_DEBUG +fprintf(stderr, "USING TLSv1.2 HASH %s\n", EVP_MD_name(md)); +#endif + p += 2; + n -= 2; + } n2s(p,i); n-=2; if (i > n) @@ -2789,6 +3020,37 @@ int ssl3_get_cert_verify(SSL *s) goto f_err; } + if (TLS1_get_version(s) >= TLS1_2_VERSION) + { + long hdatalen = 0; + void *hdata; + hdatalen = BIO_get_mem_data(s->s3->handshake_buffer, &hdata); + if (hdatalen <= 0) + { + SSLerr(SSL_F_SSL3_GET_CERT_VERIFY, ERR_R_INTERNAL_ERROR); + al=SSL_AD_INTERNAL_ERROR; + goto f_err; + } +#ifdef SSL_DEBUG + fprintf(stderr, "Using TLS 1.2 with client verify alg %s\n", + EVP_MD_name(md)); +#endif + if (!EVP_VerifyInit_ex(&mctx, md, NULL) + || !EVP_VerifyUpdate(&mctx, hdata, hdatalen)) + { + SSLerr(SSL_F_SSL3_GET_CERT_VERIFY, ERR_R_EVP_LIB); + al=SSL_AD_INTERNAL_ERROR; + goto f_err; + } + + if (EVP_VerifyFinal(&mctx, p , i, pkey) <= 0) + { + al=SSL_AD_DECRYPT_ERROR; + SSLerr(SSL_F_SSL3_GET_CERT_VERIFY,SSL_R_BAD_SIGNATURE); + goto f_err; + } + } + else #ifndef OPENSSL_NO_RSA if (pkey->type == EVP_PKEY_RSA) { @@ -2879,6 +3141,13 @@ f_err: ssl3_send_alert(s,SSL3_AL_FATAL,al); } end: + if (s->s3->handshake_buffer) + { + BIO_free(s->s3->handshake_buffer); + s->s3->handshake_buffer = NULL; + s->s3->flags &= ~TLS1_FLAGS_KEEP_HANDSHAKE; + } + EVP_MD_CTX_cleanup(&mctx); EVP_PKEY_free(pkey); return(ret); } @@ -2991,6 +3260,12 @@ int ssl3_get_client_certificate(SSL *s) al=SSL_AD_HANDSHAKE_FAILURE; goto f_err; } + /* No client certificate so digest cached records */ + if (s->s3->handshake_buffer && !ssl3_digest_cached_records(s)) + { + al=SSL_AD_INTERNAL_ERROR; + goto f_err; + } } else { @@ -3067,13 +3342,17 @@ int ssl3_send_server_certificate(SSL *s) /* SSL3_ST_SW_CERT_B */ return(ssl3_do_write(s,SSL3_RT_HANDSHAKE)); } + #ifndef OPENSSL_NO_TLSEXT +/* send a new session ticket (not necessarily for a new session) */ int ssl3_send_newsession_ticket(SSL *s) { if (s->state == SSL3_ST_SW_SESSION_TICKET_A) { unsigned char *p, *senc, *macstart; - int len, slen; + const unsigned char *const_p; + int len, slen_full, slen; + SSL_SESSION *sess; unsigned int hlen; EVP_CIPHER_CTX ctx; HMAC_CTX hctx; @@ -3082,12 +3361,38 @@ int ssl3_send_newsession_ticket(SSL *s) unsigned char key_name[16]; /* get session encoding length */ - slen = i2d_SSL_SESSION(s->session, NULL); + slen_full = i2d_SSL_SESSION(s->session, NULL); /* Some length values are 16 bits, so forget it if session is * too long */ - if (slen > 0xFF00) + if (slen_full > 0xFF00) + return -1; + senc = OPENSSL_malloc(slen_full); + if (!senc) + return -1; + p = senc; + i2d_SSL_SESSION(s->session, &p); + + /* create a fresh copy (not shared with other threads) to clean up */ + const_p = senc; + sess = d2i_SSL_SESSION(NULL, &const_p, slen_full); + if (sess == NULL) + { + OPENSSL_free(senc); return -1; + } + sess->session_id_length = 0; /* ID is irrelevant for the ticket */ + + slen = i2d_SSL_SESSION(sess, NULL); + if (slen > slen_full) /* shouldn't ever happen */ + { + OPENSSL_free(senc); + return -1; + } + p = senc; + i2d_SSL_SESSION(sess, &p); + SSL_SESSION_free(sess); + /* Grow buffer if need be: the length calculation is as * follows 1 (size of message name) + 3 (message length * bytes) + 4 (ticket lifetime hint) + 2 (ticket length) + @@ -3099,11 +3404,6 @@ int ssl3_send_newsession_ticket(SSL *s) 26 + EVP_MAX_IV_LENGTH + EVP_MAX_BLOCK_LENGTH + EVP_MAX_MD_SIZE + slen)) return -1; - senc = OPENSSL_malloc(slen); - if (!senc) - return -1; - p = senc; - i2d_SSL_SESSION(s->session, &p); p=(unsigned char *)s->init_buf->data; /* do the header */ @@ -3134,7 +3434,13 @@ int ssl3_send_newsession_ticket(SSL *s) tlsext_tick_md(), NULL); memcpy(key_name, tctx->tlsext_tick_key_name, 16); } - l2n(s->session->tlsext_tick_lifetime_hint, p); + + /* Ticket lifetime hint (advisory only): + * We leave this unspecified for resumed session (for simplicity), + * and guess that tickets for new sessions will live as long + * as their sessions. */ + l2n(s->hit ? 0 : s->session->timeout, p); + /* Skip ticket length for now */ p += 2; /* Output key name */ @@ -3209,4 +3515,72 @@ int ssl3_send_cert_status(SSL *s) /* SSL3_ST_SW_CERT_STATUS_B */ return(ssl3_do_write(s,SSL3_RT_HANDSHAKE)); } + +# ifndef OPENSSL_NO_NEXTPROTONEG +/* ssl3_get_next_proto reads a Next Protocol Negotiation handshake message. It + * sets the next_proto member in s if found */ +int ssl3_get_next_proto(SSL *s) + { + int ok; + int proto_len, padding_len; + long n; + const unsigned char *p; + + /* Clients cannot send a NextProtocol message if we didn't see the + * extension in their ClientHello */ + if (!s->s3->next_proto_neg_seen) + { + SSLerr(SSL_F_SSL3_GET_NEXT_PROTO,SSL_R_GOT_NEXT_PROTO_WITHOUT_EXTENSION); + return -1; + } + + n=s->method->ssl_get_message(s, + SSL3_ST_SR_NEXT_PROTO_A, + SSL3_ST_SR_NEXT_PROTO_B, + SSL3_MT_NEXT_PROTO, + 514, /* See the payload format below */ + &ok); + + if (!ok) + return((int)n); + + /* s->state doesn't reflect whether ChangeCipherSpec has been received + * in this handshake, but s->s3->change_cipher_spec does (will be reset + * by ssl3_get_finished). */ + if (!s->s3->change_cipher_spec) + { + SSLerr(SSL_F_SSL3_GET_NEXT_PROTO,SSL_R_GOT_NEXT_PROTO_BEFORE_A_CCS); + return -1; + } + + if (n < 2) + return 0; /* The body must be > 1 bytes long */ + + p=(unsigned char *)s->init_msg; + + /* The payload looks like: + * uint8 proto_len; + * uint8 proto[proto_len]; + * uint8 padding_len; + * uint8 padding[padding_len]; + */ + proto_len = p[0]; + if (proto_len + 2 > s->init_num) + return 0; + padding_len = p[proto_len + 1]; + if (proto_len + padding_len + 2 != s->init_num) + return 0; + + s->next_proto_negotiated = OPENSSL_malloc(proto_len); + if (!s->next_proto_negotiated) + { + SSLerr(SSL_F_SSL3_GET_NEXT_PROTO,ERR_R_MALLOC_FAILURE); + return 0; + } + memcpy(s->next_proto_negotiated, p + 1, proto_len); + s->next_proto_negotiated_len = proto_len; + + return 1; + } +# endif #endif diff --git a/src/lib/libssl/srtp.h b/src/lib/libssl/srtp.h new file mode 100644 index 0000000000..c0cf33ef28 --- /dev/null +++ b/src/lib/libssl/srtp.h @@ -0,0 +1,145 @@ +/* ssl/tls1.h */ +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ +/* ==================================================================== + * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ +/* + DTLS code by Eric Rescorla + + Copyright (C) 2006, Network Resonance, Inc. + Copyright (C) 2011, RTFM, Inc. +*/ + +#ifndef HEADER_D1_SRTP_H +#define HEADER_D1_SRTP_H + +#ifdef __cplusplus +extern "C" { +#endif + + +#define SRTP_AES128_CM_SHA1_80 0x0001 +#define SRTP_AES128_CM_SHA1_32 0x0002 +#define SRTP_AES128_F8_SHA1_80 0x0003 +#define SRTP_AES128_F8_SHA1_32 0x0004 +#define SRTP_NULL_SHA1_80 0x0005 +#define SRTP_NULL_SHA1_32 0x0006 + +int SSL_CTX_set_tlsext_use_srtp(SSL_CTX *ctx, const char *profiles); +int SSL_set_tlsext_use_srtp(SSL *ctx, const char *profiles); +SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s); + +STACK_OF(SRTP_PROTECTION_PROFILE) *SSL_get_srtp_profiles(SSL *ssl); +SRTP_PROTECTION_PROFILE *SSL_get_selected_srtp_profile(SSL *s); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/lib/libssl/ssl.h b/src/lib/libssl/ssl.h index 8f922eea72..8b0c2a2dac 100644 --- a/src/lib/libssl/ssl.h +++ b/src/lib/libssl/ssl.h @@ -252,6 +252,7 @@ extern "C" { #define SSL_TXT_kEECDH "kEECDH" #define SSL_TXT_kPSK "kPSK" #define SSL_TXT_kGOST "kGOST" +#define SSL_TXT_kSRP "kSRP" #define SSL_TXT_aRSA "aRSA" #define SSL_TXT_aDSS "aDSS" @@ -275,6 +276,7 @@ extern "C" { #define SSL_TXT_ECDSA "ECDSA" #define SSL_TXT_KRB5 "KRB5" #define SSL_TXT_PSK "PSK" +#define SSL_TXT_SRP "SRP" #define SSL_TXT_DES "DES" #define SSL_TXT_3DES "3DES" @@ -285,6 +287,7 @@ extern "C" { #define SSL_TXT_AES128 "AES128" #define SSL_TXT_AES256 "AES256" #define SSL_TXT_AES "AES" +#define SSL_TXT_AES_GCM "AESGCM" #define SSL_TXT_CAMELLIA128 "CAMELLIA128" #define SSL_TXT_CAMELLIA256 "CAMELLIA256" #define SSL_TXT_CAMELLIA "CAMELLIA" @@ -294,10 +297,14 @@ extern "C" { #define SSL_TXT_SHA "SHA" /* same as "SHA1" */ #define SSL_TXT_GOST94 "GOST94" #define SSL_TXT_GOST89MAC "GOST89MAC" +#define SSL_TXT_SHA256 "SHA256" +#define SSL_TXT_SHA384 "SHA384" #define SSL_TXT_SSLV2 "SSLv2" #define SSL_TXT_SSLV3 "SSLv3" #define SSL_TXT_TLSV1 "TLSv1" +#define SSL_TXT_TLSV1_1 "TLSv1.1" +#define SSL_TXT_TLSV1_2 "TLSv1.2" #define SSL_TXT_EXP "EXP" #define SSL_TXT_EXPORT "EXPORT" @@ -356,9 +363,29 @@ extern "C" { * in SSL_CTX. */ typedef struct ssl_st *ssl_crock_st; typedef struct tls_session_ticket_ext_st TLS_SESSION_TICKET_EXT; +typedef struct ssl_method_st SSL_METHOD; +typedef struct ssl_cipher_st SSL_CIPHER; +typedef struct ssl_session_st SSL_SESSION; + +DECLARE_STACK_OF(SSL_CIPHER) + +/* SRTP protection profiles for use with the use_srtp extension (RFC 5764)*/ +typedef struct srtp_protection_profile_st + { + const char *name; + unsigned long id; + } SRTP_PROTECTION_PROFILE; + +DECLARE_STACK_OF(SRTP_PROTECTION_PROFILE) + +typedef int (*tls_session_ticket_ext_cb_fn)(SSL *s, const unsigned char *data, int len, void *arg); +typedef int (*tls_session_secret_cb_fn)(SSL *s, void *secret, int *secret_len, STACK_OF(SSL_CIPHER) *peer_ciphers, SSL_CIPHER **cipher, void *arg); + + +#ifndef OPENSSL_NO_SSL_INTERN /* used to hold info on the particular ciphers used */ -typedef struct ssl_cipher_st +struct ssl_cipher_st { int valid; const char *name; /* text name */ @@ -375,15 +402,11 @@ typedef struct ssl_cipher_st unsigned long algorithm2; /* Extra flags */ int strength_bits; /* Number of bits really used */ int alg_bits; /* Number of bits for algorithm */ - } SSL_CIPHER; - -DECLARE_STACK_OF(SSL_CIPHER) + }; -typedef int (*tls_session_ticket_ext_cb_fn)(SSL *s, const unsigned char *data, int len, void *arg); -typedef int (*tls_session_secret_cb_fn)(SSL *s, void *secret, int *secret_len, STACK_OF(SSL_CIPHER) *peer_ciphers, SSL_CIPHER **cipher, void *arg); /* Used to hold functions for SSLv2 or SSLv3/TLSv1 functions */ -typedef struct ssl_method_st +struct ssl_method_st { int version; int (*ssl_new)(SSL *s); @@ -416,7 +439,7 @@ typedef struct ssl_method_st int (*ssl_version)(void); long (*ssl_callback_ctrl)(SSL *s, int cb_id, void (*fp)(void)); long (*ssl_ctx_callback_ctrl)(SSL_CTX *s, int cb_id, void (*fp)(void)); - } SSL_METHOD; + }; /* Lets make this into an ASN.1 type structure as follows * SSL_SESSION_ID ::= SEQUENCE { @@ -433,14 +456,17 @@ typedef struct ssl_method_st * Session_ID_context [ 4 ] EXPLICIT OCTET STRING, -- the Session ID context * Verify_result [ 5 ] EXPLICIT INTEGER, -- X509_V_... code for `Peer' * HostName [ 6 ] EXPLICIT OCTET STRING, -- optional HostName from servername TLS extension - * ECPointFormatList [ 7 ] OCTET STRING, -- optional EC point format list from TLS extension - * PSK_identity_hint [ 8 ] EXPLICIT OCTET STRING, -- optional PSK identity hint - * PSK_identity [ 9 ] EXPLICIT OCTET STRING -- optional PSK identity + * PSK_identity_hint [ 7 ] EXPLICIT OCTET STRING, -- optional PSK identity hint + * PSK_identity [ 8 ] EXPLICIT OCTET STRING, -- optional PSK identity + * Ticket_lifetime_hint [9] EXPLICIT INTEGER, -- server's lifetime hint for session ticket + * Ticket [10] EXPLICIT OCTET STRING, -- session ticket (clients only) + * Compression_meth [11] EXPLICIT OCTET STRING, -- optional compression method + * SRP_username [ 12 ] EXPLICIT OCTET STRING -- optional SRP username * } * Look in ssl/ssl_asn1.c for more details * I'm using EXPLICIT tags so I can read the damn things using asn1parse :-). */ -typedef struct ssl_session_st +struct ssl_session_st { int ssl_version; /* what ssl version session info is * being kept in here? */ @@ -512,8 +538,12 @@ typedef struct ssl_session_st size_t tlsext_ticklen; /* Session ticket length */ long tlsext_tick_lifetime_hint; /* Session lifetime hint in seconds */ #endif - } SSL_SESSION; +#ifndef OPENSSL_NO_SRP + char *srp_username; +#endif + }; +#endif #define SSL_OP_MICROSOFT_SESS_ID_BUG 0x00000001L #define SSL_OP_NETSCAPE_CHALLENGE_BUG 0x00000002L @@ -536,7 +566,7 @@ typedef struct ssl_session_st /* SSL_OP_ALL: various bug workarounds that should be rather harmless. * This used to be 0x000FFFFFL before 0.9.7. */ -#define SSL_OP_ALL 0x80000FFFL +#define SSL_OP_ALL 0x80000BFFL /* DTLS options */ #define SSL_OP_NO_QUERY_MTU 0x00001000L @@ -572,11 +602,17 @@ typedef struct ssl_session_st #define SSL_OP_NO_SSLv2 0x01000000L #define SSL_OP_NO_SSLv3 0x02000000L #define SSL_OP_NO_TLSv1 0x04000000L +#define SSL_OP_NO_TLSv1_2 0x08000000L +#define SSL_OP_NO_TLSv1_1 0x10000000L +/* These next two were never actually used for anything since SSLeay + * zap so we have some more flags. + */ /* The next flag deliberately changes the ciphertest, this is a check * for the PKCS#1 attack */ -#define SSL_OP_PKCS1_CHECK_1 0x08000000L -#define SSL_OP_PKCS1_CHECK_2 0x10000000L +#define SSL_OP_PKCS1_CHECK_1 0x0 +#define SSL_OP_PKCS1_CHECK_2 0x0 + #define SSL_OP_NETSCAPE_CA_DN_BUG 0x20000000L #define SSL_OP_NETSCAPE_DEMO_CIPHER_CHANGE_BUG 0x40000000L /* Make server add server-hello extension from early version of @@ -637,12 +673,53 @@ typedef struct ssl_session_st #define SSL_get_secure_renegotiation_support(ssl) \ SSL_ctrl((ssl), SSL_CTRL_GET_RI_SUPPORT, 0, NULL) +#ifndef OPENSSL_NO_HEARTBEATS +#define SSL_heartbeat(ssl) \ + SSL_ctrl((ssl),SSL_CTRL_TLS_EXT_SEND_HEARTBEAT,0,NULL) +#endif + void SSL_CTX_set_msg_callback(SSL_CTX *ctx, void (*cb)(int write_p, int version, int content_type, const void *buf, size_t len, SSL *ssl, void *arg)); void SSL_set_msg_callback(SSL *ssl, void (*cb)(int write_p, int version, int content_type, const void *buf, size_t len, SSL *ssl, void *arg)); #define SSL_CTX_set_msg_callback_arg(ctx, arg) SSL_CTX_ctrl((ctx), SSL_CTRL_SET_MSG_CALLBACK_ARG, 0, (arg)) #define SSL_set_msg_callback_arg(ssl, arg) SSL_ctrl((ssl), SSL_CTRL_SET_MSG_CALLBACK_ARG, 0, (arg)) +#ifndef OPENSSL_NO_SRP +#ifndef OPENSSL_NO_SSL_INTERN + +typedef struct srp_ctx_st + { + /* param for all the callbacks */ + void *SRP_cb_arg; + /* set client Hello login callback */ + int (*TLS_ext_srp_username_callback)(SSL *, int *, void *); + /* set SRP N/g param callback for verification */ + int (*SRP_verify_param_callback)(SSL *, void *); + /* set SRP client passwd callback */ + char *(*SRP_give_srp_client_pwd_callback)(SSL *, void *); + + char *login; + BIGNUM *N,*g,*s,*B,*A; + BIGNUM *a,*b,*v; + char *info; + int strength; + + unsigned long srp_Mask; + } SRP_CTX; + +#endif + +/* see tls_srp.c */ +int SSL_SRP_CTX_init(SSL *s); +int SSL_CTX_SRP_CTX_init(SSL_CTX *ctx); +int SSL_SRP_CTX_free(SSL *ctx); +int SSL_CTX_SRP_CTX_free(SSL_CTX *ctx); +int SSL_srp_server_param_with_username(SSL *s, int *ad); +int SRP_generate_server_master_secret(SSL *s,unsigned char *master_key); +int SRP_Calc_A_param(SSL *s); +int SRP_generate_client_master_secret(SSL *s,unsigned char *master_key); + +#endif #if defined(OPENSSL_SYS_MSDOS) && !defined(OPENSSL_SYS_WIN32) #define SSL_MAX_CERT_LIST_DEFAULT 1024*30 /* 30k max cert list :-) */ @@ -668,7 +745,11 @@ void SSL_set_msg_callback(SSL *ssl, void (*cb)(int write_p, int version, int con typedef int (*GEN_SESSION_CB)(const SSL *ssl, unsigned char *id, unsigned int *id_len); -typedef struct ssl_comp_st +typedef struct ssl_comp_st SSL_COMP; + +#ifndef OPENSSL_NO_SSL_INTERN + +struct ssl_comp_st { int id; const char *name; @@ -677,7 +758,7 @@ typedef struct ssl_comp_st #else char *method; #endif - } SSL_COMP; + }; DECLARE_STACK_OF(SSL_COMP) DECLARE_LHASH_OF(SSL_SESSION); @@ -846,7 +927,6 @@ struct ssl_ctx_st /* Callback for status request */ int (*tlsext_status_cb)(SSL *ssl, void *arg); void *tlsext_status_arg; - /* draft-rescorla-tls-opaque-prf-input-00.txt information */ int (*tlsext_opaque_prf_input_callback)(SSL *, void *peerinput, size_t len, void *arg); void *tlsext_opaque_prf_input_callback_arg; @@ -866,9 +946,37 @@ struct ssl_ctx_st unsigned int freelist_max_len; struct ssl3_buf_freelist_st *wbuf_freelist; struct ssl3_buf_freelist_st *rbuf_freelist; +#endif +#ifndef OPENSSL_NO_SRP + SRP_CTX srp_ctx; /* ctx for SRP authentication */ +#endif + +#ifndef OPENSSL_NO_TLSEXT +# ifndef OPENSSL_NO_NEXTPROTONEG + /* Next protocol negotiation information */ + /* (for experimental NPN extension). */ + + /* For a server, this contains a callback function by which the set of + * advertised protocols can be provided. */ + int (*next_protos_advertised_cb)(SSL *s, const unsigned char **buf, + unsigned int *len, void *arg); + void *next_protos_advertised_cb_arg; + /* For a client, this contains a callback function that selects the + * next protocol from the list provided by the server. */ + int (*next_proto_select_cb)(SSL *s, unsigned char **out, + unsigned char *outlen, + const unsigned char *in, + unsigned int inlen, + void *arg); + void *next_proto_select_cb_arg; +# endif + /* SRTP profiles we are willing to do from RFC 5764 */ + STACK_OF(SRTP_PROTECTION_PROFILE) *srtp_profiles; #endif }; +#endif + #define SSL_SESS_CACHE_OFF 0x0000 #define SSL_SESS_CACHE_CLIENT 0x0001 #define SSL_SESS_CACHE_SERVER 0x0002 @@ -921,6 +1029,32 @@ int SSL_CTX_set_client_cert_engine(SSL_CTX *ctx, ENGINE *e); #endif void SSL_CTX_set_cookie_generate_cb(SSL_CTX *ctx, int (*app_gen_cookie_cb)(SSL *ssl, unsigned char *cookie, unsigned int *cookie_len)); void SSL_CTX_set_cookie_verify_cb(SSL_CTX *ctx, int (*app_verify_cookie_cb)(SSL *ssl, unsigned char *cookie, unsigned int cookie_len)); +#ifndef OPENSSL_NO_NEXTPROTONEG +void SSL_CTX_set_next_protos_advertised_cb(SSL_CTX *s, + int (*cb) (SSL *ssl, + const unsigned char **out, + unsigned int *outlen, + void *arg), + void *arg); +void SSL_CTX_set_next_proto_select_cb(SSL_CTX *s, + int (*cb) (SSL *ssl, + unsigned char **out, + unsigned char *outlen, + const unsigned char *in, + unsigned int inlen, + void *arg), + void *arg); + +int SSL_select_next_proto(unsigned char **out, unsigned char *outlen, + const unsigned char *in, unsigned int inlen, + const unsigned char *client, unsigned int client_len); +void SSL_get0_next_proto_negotiated(const SSL *s, + const unsigned char **data, unsigned *len); + +#define OPENSSL_NPN_UNSUPPORTED 0 +#define OPENSSL_NPN_NEGOTIATED 1 +#define OPENSSL_NPN_NO_OVERLAP 2 +#endif #ifndef OPENSSL_NO_PSK /* the maximum length of the buffer given to callbacks containing the @@ -961,6 +1095,8 @@ const char *SSL_get_psk_identity(const SSL *s); #define SSL_MAC_FLAG_READ_MAC_STREAM 1 #define SSL_MAC_FLAG_WRITE_MAC_STREAM 2 +#ifndef OPENSSL_NO_SSL_INTERN + struct ssl_st { /* protocol version @@ -1005,9 +1141,7 @@ struct ssl_st int server; /* are we the server side? - mostly used by SSL_clear*/ - int new_session;/* 1 if we are to use a new session. - * 2 if we are a server and are inside a handshake - * (i.e. not just sending a HelloRequest) + int new_session;/* Generate a new session or reuse an old one. * NB: For servers, the 'new' session may actually be a previously * cached session or even the previous session unless * SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION is set */ @@ -1177,12 +1311,46 @@ struct ssl_st void *tls_session_secret_cb_arg; SSL_CTX * initial_ctx; /* initial ctx, used to store sessions */ + +#ifndef OPENSSL_NO_NEXTPROTONEG + /* Next protocol negotiation. For the client, this is the protocol that + * we sent in NextProtocol and is set when handling ServerHello + * extensions. + * + * For a server, this is the client's selected_protocol from + * NextProtocol and is set when handling the NextProtocol message, + * before the Finished message. */ + unsigned char *next_proto_negotiated; + unsigned char next_proto_negotiated_len; +#endif + #define session_ctx initial_ctx + + STACK_OF(SRTP_PROTECTION_PROFILE) *srtp_profiles; /* What we'll do */ + SRTP_PROTECTION_PROFILE *srtp_profile; /* What's been chosen */ + + unsigned int tlsext_heartbeat; /* Is use of the Heartbeat extension negotiated? + 0: disabled + 1: enabled + 2: enabled, but not allowed to send Requests + */ + unsigned int tlsext_hb_pending; /* Indicates if a HeartbeatRequest is in flight */ + unsigned int tlsext_hb_seq; /* HeartbeatRequest sequence number */ #else #define session_ctx ctx #endif /* OPENSSL_NO_TLSEXT */ + + int renegotiate;/* 1 if we are renegotiating. + * 2 if we are a server and are inside a handshake + * (i.e. not just sending a HelloRequest) */ + +#ifndef OPENSSL_NO_SRP + SRP_CTX srp_ctx; /* ctx for SRP authentication */ +#endif }; +#endif + #ifdef __cplusplus } #endif @@ -1192,6 +1360,7 @@ struct ssl_st #include /* This is mostly sslv3 with a few tweaks */ #include /* Datagram TLS */ #include +#include /* Support for the use_srtp extension */ #ifdef __cplusplus extern "C" { @@ -1408,6 +1577,20 @@ DECLARE_PEM_rw(SSL_SESSION, SSL_SESSION) #define SSL_CTRL_SET_TLSEXT_STATUS_REQ_OCSP_RESP 71 #define SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB 72 + +#define SSL_CTRL_SET_TLS_EXT_SRP_USERNAME_CB 75 +#define SSL_CTRL_SET_SRP_VERIFY_PARAM_CB 76 +#define SSL_CTRL_SET_SRP_GIVE_CLIENT_PWD_CB 77 + +#define SSL_CTRL_SET_SRP_ARG 78 +#define SSL_CTRL_SET_TLS_EXT_SRP_USERNAME 79 +#define SSL_CTRL_SET_TLS_EXT_SRP_STRENGTH 80 +#define SSL_CTRL_SET_TLS_EXT_SRP_PASSWORD 81 +#ifndef OPENSSL_NO_HEARTBEATS +#define SSL_CTRL_TLS_EXT_SEND_HEARTBEAT 85 +#define SSL_CTRL_GET_TLS_EXT_HEARTBEAT_PENDING 86 +#define SSL_CTRL_SET_TLS_EXT_HEARTBEAT_NO_REQUESTS 87 +#endif #endif #define DTLS_CTRL_GET_TIMEOUT 73 @@ -1418,6 +1601,9 @@ DECLARE_PEM_rw(SSL_SESSION, SSL_SESSION) #define SSL_CTRL_CLEAR_OPTIONS 77 #define SSL_CTRL_CLEAR_MODE 78 +#define SSL_CTRL_GET_EXTRA_CHAIN_CERTS 82 +#define SSL_CTRL_CLEAR_EXTRA_CHAIN_CERTS 83 + #define DTLSv1_get_timeout(ssl, arg) \ SSL_ctrl(ssl,DTLS_CTRL_GET_TIMEOUT,0, (void *)arg) #define DTLSv1_handle_timeout(ssl) \ @@ -1454,6 +1640,10 @@ DECLARE_PEM_rw(SSL_SESSION, SSL_SESSION) #define SSL_CTX_add_extra_chain_cert(ctx,x509) \ SSL_CTX_ctrl(ctx,SSL_CTRL_EXTRA_CHAIN_CERT,0,(char *)x509) +#define SSL_CTX_get_extra_chain_certs(ctx,px509) \ + SSL_CTX_ctrl(ctx,SSL_CTRL_GET_EXTRA_CHAIN_CERTS,0,px509) +#define SSL_CTX_clear_extra_chain_certs(ctx) \ + SSL_CTX_ctrl(ctx,SSL_CTRL_CLEAR_EXTRA_CHAIN_CERTS,0,NULL) #ifndef OPENSSL_NO_BIO BIO_METHOD *BIO_f_ssl(void); @@ -1481,6 +1671,7 @@ const SSL_CIPHER *SSL_get_current_cipher(const SSL *s); int SSL_CIPHER_get_bits(const SSL_CIPHER *c,int *alg_bits); char * SSL_CIPHER_get_version(const SSL_CIPHER *c); const char * SSL_CIPHER_get_name(const SSL_CIPHER *c); +unsigned long SSL_CIPHER_get_id(const SSL_CIPHER *c); int SSL_get_fd(const SSL *s); int SSL_get_rfd(const SSL *s); @@ -1546,10 +1737,14 @@ long SSL_SESSION_set_time(SSL_SESSION *s, long t); long SSL_SESSION_get_timeout(const SSL_SESSION *s); long SSL_SESSION_set_timeout(SSL_SESSION *s, long t); void SSL_copy_session_id(SSL *to,const SSL *from); +X509 *SSL_SESSION_get0_peer(SSL_SESSION *s); +int SSL_SESSION_set1_id_context(SSL_SESSION *s,const unsigned char *sid_ctx, + unsigned int sid_ctx_len); SSL_SESSION *SSL_SESSION_new(void); const unsigned char *SSL_SESSION_get_id(const SSL_SESSION *s, unsigned int *len); +unsigned int SSL_SESSION_get_compress_id(const SSL_SESSION *s); #ifndef OPENSSL_NO_FP_API int SSL_SESSION_print_fp(FILE *fp,const SSL_SESSION *ses); #endif @@ -1612,6 +1807,30 @@ int SSL_set_trust(SSL *s, int trust); int SSL_CTX_set1_param(SSL_CTX *ctx, X509_VERIFY_PARAM *vpm); int SSL_set1_param(SSL *ssl, X509_VERIFY_PARAM *vpm); +#ifndef OPENSSL_NO_SRP +int SSL_CTX_set_srp_username(SSL_CTX *ctx,char *name); +int SSL_CTX_set_srp_password(SSL_CTX *ctx,char *password); +int SSL_CTX_set_srp_strength(SSL_CTX *ctx, int strength); +int SSL_CTX_set_srp_client_pwd_callback(SSL_CTX *ctx, + char *(*cb)(SSL *,void *)); +int SSL_CTX_set_srp_verify_param_callback(SSL_CTX *ctx, + int (*cb)(SSL *,void *)); +int SSL_CTX_set_srp_username_callback(SSL_CTX *ctx, + int (*cb)(SSL *,int *,void *)); +int SSL_CTX_set_srp_cb_arg(SSL_CTX *ctx, void *arg); + +int SSL_set_srp_server_param(SSL *s, const BIGNUM *N, const BIGNUM *g, + BIGNUM *sa, BIGNUM *v, char *info); +int SSL_set_srp_server_param_pw(SSL *s, const char *user, const char *pass, + const char *grp); + +BIGNUM *SSL_get_srp_g(SSL *s); +BIGNUM *SSL_get_srp_N(SSL *s); + +char *SSL_get_srp_username(SSL *s); +char *SSL_get_srp_userinfo(SSL *s); +#endif + void SSL_free(SSL *ssl); int SSL_accept(SSL *ssl); int SSL_connect(SSL *ssl); @@ -1647,6 +1866,15 @@ const SSL_METHOD *TLSv1_method(void); /* TLSv1.0 */ const SSL_METHOD *TLSv1_server_method(void); /* TLSv1.0 */ const SSL_METHOD *TLSv1_client_method(void); /* TLSv1.0 */ +const SSL_METHOD *TLSv1_1_method(void); /* TLSv1.1 */ +const SSL_METHOD *TLSv1_1_server_method(void); /* TLSv1.1 */ +const SSL_METHOD *TLSv1_1_client_method(void); /* TLSv1.1 */ + +const SSL_METHOD *TLSv1_2_method(void); /* TLSv1.2 */ +const SSL_METHOD *TLSv1_2_server_method(void); /* TLSv1.2 */ +const SSL_METHOD *TLSv1_2_client_method(void); /* TLSv1.2 */ + + const SSL_METHOD *DTLSv1_method(void); /* DTLSv1.0 */ const SSL_METHOD *DTLSv1_server_method(void); /* DTLSv1.0 */ const SSL_METHOD *DTLSv1_client_method(void); /* DTLSv1.0 */ @@ -1655,6 +1883,7 @@ STACK_OF(SSL_CIPHER) *SSL_get_ciphers(const SSL *s); int SSL_do_handshake(SSL *s); int SSL_renegotiate(SSL *s); +int SSL_renegotiate_abbreviated(SSL *s); int SSL_renegotiate_pending(SSL *s); int SSL_shutdown(SSL *s); @@ -1706,6 +1935,7 @@ void SSL_set_info_callback(SSL *ssl, void (*cb)(const SSL *ssl,int type,int val)); void (*SSL_get_info_callback(const SSL *ssl))(const SSL *ssl,int type,int val); int SSL_state(const SSL *ssl); +void SSL_set_state(SSL *ssl, int state); void SSL_set_verify_result(SSL *ssl,long v); long SSL_get_verify_result(const SSL *ssl); @@ -1806,6 +2036,9 @@ int SSL_set_session_ticket_ext_cb(SSL *s, tls_session_ticket_ext_cb_fn cb, /* Pre-shared secret session resumption functions */ int SSL_set_session_secret_cb(SSL *s, tls_session_secret_cb_fn tls_session_secret_cb, void *arg); +void SSL_set_debug(SSL *s, int debug); +int SSL_cache_hit(SSL *s); + /* BEGIN ERROR CODES */ /* The following lines are auto generated by the script mkerr.pl. Any changes * made after this point may be overwritten when the script is next run. @@ -1825,6 +2058,7 @@ void ERR_load_SSL_strings(void); #define SSL_F_DTLS1_ACCEPT 246 #define SSL_F_DTLS1_ADD_CERT_TO_BUF 295 #define SSL_F_DTLS1_BUFFER_RECORD 247 +#define SSL_F_DTLS1_CHECK_TIMEOUT_NUM 316 #define SSL_F_DTLS1_CLIENT_HELLO 248 #define SSL_F_DTLS1_CONNECT 249 #define SSL_F_DTLS1_ENC 250 @@ -1833,6 +2067,7 @@ void ERR_load_SSL_strings(void); #define SSL_F_DTLS1_GET_MESSAGE_FRAGMENT 253 #define SSL_F_DTLS1_GET_RECORD 254 #define SSL_F_DTLS1_HANDLE_TIMEOUT 297 +#define SSL_F_DTLS1_HEARTBEAT 305 #define SSL_F_DTLS1_OUTPUT_CERT_CHAIN 255 #define SSL_F_DTLS1_PREPROCESS_FRAGMENT 288 #define SSL_F_DTLS1_PROCESS_OUT_OF_SEQ_MESSAGE 256 @@ -1901,6 +2136,7 @@ void ERR_load_SSL_strings(void); #define SSL_F_SSL3_GET_KEY_EXCHANGE 141 #define SSL_F_SSL3_GET_MESSAGE 142 #define SSL_F_SSL3_GET_NEW_SESSION_TICKET 283 +#define SSL_F_SSL3_GET_NEXT_PROTO 306 #define SSL_F_SSL3_GET_RECORD 143 #define SSL_F_SSL3_GET_SERVER_CERTIFICATE 144 #define SSL_F_SSL3_GET_SERVER_DONE 145 @@ -1925,10 +2161,12 @@ void ERR_load_SSL_strings(void); #define SSL_F_SSL3_WRITE_PENDING 159 #define SSL_F_SSL_ADD_CLIENTHELLO_RENEGOTIATE_EXT 298 #define SSL_F_SSL_ADD_CLIENTHELLO_TLSEXT 277 +#define SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT 307 #define SSL_F_SSL_ADD_DIR_CERT_SUBJECTS_TO_STACK 215 #define SSL_F_SSL_ADD_FILE_CERT_SUBJECTS_TO_STACK 216 #define SSL_F_SSL_ADD_SERVERHELLO_RENEGOTIATE_EXT 299 #define SSL_F_SSL_ADD_SERVERHELLO_TLSEXT 278 +#define SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT 308 #define SSL_F_SSL_BAD_METHOD 160 #define SSL_F_SSL_BYTES_TO_CIPHER_LIST 161 #define SSL_F_SSL_CERT_DUP 221 @@ -1945,6 +2183,7 @@ void ERR_load_SSL_strings(void); #define SSL_F_SSL_CREATE_CIPHER_LIST 166 #define SSL_F_SSL_CTRL 232 #define SSL_F_SSL_CTX_CHECK_PRIVATE_KEY 168 +#define SSL_F_SSL_CTX_MAKE_PROFILES 309 #define SSL_F_SSL_CTX_NEW 169 #define SSL_F_SSL_CTX_SET_CIPHER_LIST 269 #define SSL_F_SSL_CTX_SET_CLIENT_CERT_ENGINE 290 @@ -1973,8 +2212,10 @@ void ERR_load_SSL_strings(void); #define SSL_F_SSL_NEW 186 #define SSL_F_SSL_PARSE_CLIENTHELLO_RENEGOTIATE_EXT 300 #define SSL_F_SSL_PARSE_CLIENTHELLO_TLSEXT 302 +#define SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT 310 #define SSL_F_SSL_PARSE_SERVERHELLO_RENEGOTIATE_EXT 301 #define SSL_F_SSL_PARSE_SERVERHELLO_TLSEXT 303 +#define SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT 311 #define SSL_F_SSL_PEEK 270 #define SSL_F_SSL_PREPARE_CLIENTHELLO_TLSEXT 281 #define SSL_F_SSL_PREPARE_SERVERHELLO_TLSEXT 282 @@ -1983,6 +2224,7 @@ void ERR_load_SSL_strings(void); #define SSL_F_SSL_RSA_PUBLIC_ENCRYPT 188 #define SSL_F_SSL_SESSION_NEW 189 #define SSL_F_SSL_SESSION_PRINT_FP 190 +#define SSL_F_SSL_SESSION_SET1_ID_CONTEXT 312 #define SSL_F_SSL_SESS_CERT_NEW 225 #define SSL_F_SSL_SET_CERT 191 #define SSL_F_SSL_SET_CIPHER_LIST 271 @@ -1996,6 +2238,7 @@ void ERR_load_SSL_strings(void); #define SSL_F_SSL_SET_TRUST 228 #define SSL_F_SSL_SET_WFD 196 #define SSL_F_SSL_SHUTDOWN 224 +#define SSL_F_SSL_SRP_CTX_INIT 313 #define SSL_F_SSL_UNDEFINED_CONST_FUNCTION 243 #define SSL_F_SSL_UNDEFINED_FUNCTION 197 #define SSL_F_SSL_UNDEFINED_VOID_FUNCTION 244 @@ -2015,6 +2258,8 @@ void ERR_load_SSL_strings(void); #define SSL_F_TLS1_CHANGE_CIPHER_STATE 209 #define SSL_F_TLS1_CHECK_SERVERHELLO_TLSEXT 274 #define SSL_F_TLS1_ENC 210 +#define SSL_F_TLS1_EXPORT_KEYING_MATERIAL 314 +#define SSL_F_TLS1_HEARTBEAT 315 #define SSL_F_TLS1_PREPARE_CLIENTHELLO_TLSEXT 275 #define SSL_F_TLS1_PREPARE_SERVERHELLO_TLSEXT 276 #define SSL_F_TLS1_PRF 284 @@ -2054,6 +2299,13 @@ void ERR_load_SSL_strings(void); #define SSL_R_BAD_RSA_MODULUS_LENGTH 121 #define SSL_R_BAD_RSA_SIGNATURE 122 #define SSL_R_BAD_SIGNATURE 123 +#define SSL_R_BAD_SRP_A_LENGTH 347 +#define SSL_R_BAD_SRP_B_LENGTH 348 +#define SSL_R_BAD_SRP_G_LENGTH 349 +#define SSL_R_BAD_SRP_N_LENGTH 350 +#define SSL_R_BAD_SRP_S_LENGTH 351 +#define SSL_R_BAD_SRTP_MKI_VALUE 352 +#define SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST 353 #define SSL_R_BAD_SSL_FILETYPE 124 #define SSL_R_BAD_SSL_SESSION_ID_LENGTH 125 #define SSL_R_BAD_STATE 126 @@ -2092,12 +2344,15 @@ void ERR_load_SSL_strings(void); #define SSL_R_ECC_CERT_SHOULD_HAVE_RSA_SIGNATURE 322 #define SSL_R_ECC_CERT_SHOULD_HAVE_SHA1_SIGNATURE 323 #define SSL_R_ECGROUP_TOO_LARGE_FOR_CIPHER 310 +#define SSL_R_EMPTY_SRTP_PROTECTION_PROFILE_LIST 354 #define SSL_R_ENCRYPTED_LENGTH_TOO_LONG 150 #define SSL_R_ERROR_GENERATING_TMP_RSA_KEY 282 #define SSL_R_ERROR_IN_RECEIVED_CIPHER_LIST 151 #define SSL_R_EXCESSIVE_MESSAGE_SIZE 152 #define SSL_R_EXTRA_DATA_IN_MESSAGE 153 #define SSL_R_GOT_A_FIN_BEFORE_A_CCS 154 +#define SSL_R_GOT_NEXT_PROTO_BEFORE_A_CCS 355 +#define SSL_R_GOT_NEXT_PROTO_WITHOUT_EXTENSION 356 #define SSL_R_HTTPS_PROXY_REQUEST 155 #define SSL_R_HTTP_REQUEST 156 #define SSL_R_ILLEGAL_PADDING 283 @@ -2106,6 +2361,7 @@ void ERR_load_SSL_strings(void); #define SSL_R_INVALID_COMMAND 280 #define SSL_R_INVALID_COMPRESSION_ALGORITHM 341 #define SSL_R_INVALID_PURPOSE 278 +#define SSL_R_INVALID_SRP_USERNAME 357 #define SSL_R_INVALID_STATUS_RESPONSE 328 #define SSL_R_INVALID_TICKET_KEYS_LENGTH 325 #define SSL_R_INVALID_TRUST 279 @@ -2135,6 +2391,7 @@ void ERR_load_SSL_strings(void); #define SSL_R_MISSING_RSA_CERTIFICATE 168 #define SSL_R_MISSING_RSA_ENCRYPTING_CERT 169 #define SSL_R_MISSING_RSA_SIGNING_CERT 170 +#define SSL_R_MISSING_SRP_PARAM 358 #define SSL_R_MISSING_TMP_DH_KEY 171 #define SSL_R_MISSING_TMP_ECDH_KEY 311 #define SSL_R_MISSING_TMP_RSA_KEY 172 @@ -2164,6 +2421,7 @@ void ERR_load_SSL_strings(void); #define SSL_R_NO_RENEGOTIATION 339 #define SSL_R_NO_REQUIRED_DIGEST 324 #define SSL_R_NO_SHARED_CIPHER 193 +#define SSL_R_NO_SRTP_PROFILES 359 #define SSL_R_NO_VERIFY_CALLBACK 194 #define SSL_R_NULL_SSL_CTX 195 #define SSL_R_NULL_SSL_METHOD_PASSED 196 @@ -2207,7 +2465,12 @@ void ERR_load_SSL_strings(void); #define SSL_R_SERVERHELLO_TLSEXT 275 #define SSL_R_SESSION_ID_CONTEXT_UNINITIALIZED 277 #define SSL_R_SHORT_READ 219 +#define SSL_R_SIGNATURE_ALGORITHMS_ERROR 360 #define SSL_R_SIGNATURE_FOR_NON_SIGNING_CERTIFICATE 220 +#define SSL_R_SRP_A_CALC 361 +#define SSL_R_SRTP_COULD_NOT_ALLOCATE_PROFILES 362 +#define SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG 363 +#define SSL_R_SRTP_UNKNOWN_PROTECTION_PROFILE 364 #define SSL_R_SSL23_DOING_SESSION_ID_REUSE 221 #define SSL_R_SSL2_CONNECTION_ID_TOO_LONG 299 #define SSL_R_SSL3_EXT_INVALID_ECPOINTFORMAT 321 @@ -2252,6 +2515,9 @@ void ERR_load_SSL_strings(void); #define SSL_R_TLSV1_UNRECOGNIZED_NAME 1112 #define SSL_R_TLSV1_UNSUPPORTED_EXTENSION 1110 #define SSL_R_TLS_CLIENT_CERT_REQ_WITH_ANON_CIPHER 232 +#define SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT 365 +#define SSL_R_TLS_HEARTBEAT_PENDING 366 +#define SSL_R_TLS_ILLEGAL_EXPORTER_LABEL 367 #define SSL_R_TLS_INVALID_ECPOINTFORMAT_LIST 157 #define SSL_R_TLS_PEER_DID_NOT_RESPOND_WITH_CERTIFICATE_LIST 233 #define SSL_R_TLS_RSA_ENCRYPTED_VALUE_LENGTH_IS_WRONG 234 @@ -2273,6 +2539,7 @@ void ERR_load_SSL_strings(void); #define SSL_R_UNKNOWN_CERTIFICATE_TYPE 247 #define SSL_R_UNKNOWN_CIPHER_RETURNED 248 #define SSL_R_UNKNOWN_CIPHER_TYPE 249 +#define SSL_R_UNKNOWN_DIGEST 368 #define SSL_R_UNKNOWN_KEY_EXCHANGE_TYPE 250 #define SSL_R_UNKNOWN_PKEY_TYPE 251 #define SSL_R_UNKNOWN_PROTOCOL 252 @@ -2287,12 +2554,14 @@ void ERR_load_SSL_strings(void); #define SSL_R_UNSUPPORTED_PROTOCOL 258 #define SSL_R_UNSUPPORTED_SSL_VERSION 259 #define SSL_R_UNSUPPORTED_STATUS_TYPE 329 +#define SSL_R_USE_SRTP_NOT_NEGOTIATED 369 #define SSL_R_WRITE_BIO_NOT_SET 260 #define SSL_R_WRONG_CIPHER_RETURNED 261 #define SSL_R_WRONG_MESSAGE_TYPE 262 #define SSL_R_WRONG_NUMBER_OF_KEY_BITS 263 #define SSL_R_WRONG_SIGNATURE_LENGTH 264 #define SSL_R_WRONG_SIGNATURE_SIZE 265 +#define SSL_R_WRONG_SIGNATURE_TYPE 370 #define SSL_R_WRONG_SSL_VERSION 266 #define SSL_R_WRONG_VERSION_NUMBER 267 #define SSL_R_X509_LIB 268 diff --git a/src/lib/libssl/ssl2.h b/src/lib/libssl/ssl2.h index 99a52ea0dd..eb25dcb0bf 100644 --- a/src/lib/libssl/ssl2.h +++ b/src/lib/libssl/ssl2.h @@ -155,6 +155,8 @@ extern "C" { #define CERT char #endif +#ifndef OPENSSL_NO_SSL_INTERN + typedef struct ssl2_state_st { int three_byte_header; @@ -219,6 +221,8 @@ typedef struct ssl2_state_st } tmp; } SSL2_STATE; +#endif + /* SSLv2 */ /* client */ #define SSL2_ST_SEND_CLIENT_HELLO_A (0x10|SSL_ST_CONNECT) diff --git a/src/lib/libssl/ssl3.h b/src/lib/libssl/ssl3.h index 9c2c41287a..112e627de0 100644 --- a/src/lib/libssl/ssl3.h +++ b/src/lib/libssl/ssl3.h @@ -322,6 +322,7 @@ extern "C" { #define SSL3_RT_ALERT 21 #define SSL3_RT_HANDSHAKE 22 #define SSL3_RT_APPLICATION_DATA 23 +#define TLS1_RT_HEARTBEAT 24 #define SSL3_AL_WARNING 1 #define SSL3_AL_FATAL 2 @@ -339,6 +340,11 @@ extern "C" { #define SSL3_AD_CERTIFICATE_UNKNOWN 46 #define SSL3_AD_ILLEGAL_PARAMETER 47 /* fatal */ +#define TLS1_HB_REQUEST 1 +#define TLS1_HB_RESPONSE 2 + +#ifndef OPENSSL_NO_SSL_INTERN + typedef struct ssl3_record_st { /*r */ int type; /* type of record */ @@ -360,6 +366,8 @@ typedef struct ssl3_buffer_st int left; /* how many bytes left */ } SSL3_BUFFER; +#endif + #define SSL3_CT_RSA_SIGN 1 #define SSL3_CT_DSS_SIGN 2 #define SSL3_CT_RSA_FIXED_DH 3 @@ -379,6 +387,7 @@ typedef struct ssl3_buffer_st #define SSL3_FLAGS_POP_BUFFER 0x0004 #define TLS1_FLAGS_TLS_PADDING_BUG 0x0008 #define TLS1_FLAGS_SKIP_CERT_VERIFY 0x0010 +#define TLS1_FLAGS_KEEP_HANDSHAKE 0x0020 /* SSL3_FLAGS_SGC_RESTART_DONE is set when we * restart a handshake because of MS SGC and so prevents us @@ -391,6 +400,8 @@ typedef struct ssl3_buffer_st */ #define SSL3_FLAGS_SGC_RESTART_DONE 0x0040 +#ifndef OPENSSL_NO_SSL_INTERN + typedef struct ssl3_state_st { long flags; @@ -475,7 +486,7 @@ typedef struct ssl3_state_st int finish_md_len; unsigned char peer_finish_md[EVP_MAX_MD_SIZE*2]; int peer_finish_md_len; - + unsigned long message_size; int message_type; @@ -523,13 +534,23 @@ typedef struct ssl3_state_st unsigned char previous_server_finished[EVP_MAX_MD_SIZE]; unsigned char previous_server_finished_len; int send_connection_binding; /* TODOEKR */ + +#ifndef OPENSSL_NO_NEXTPROTONEG + /* Set if we saw the Next Protocol Negotiation extension from our peer. */ + int next_proto_neg_seen; +#endif } SSL3_STATE; +#endif /* SSLv3 */ /*client */ /* extra state */ #define SSL3_ST_CW_FLUSH (0x100|SSL_ST_CONNECT) +#ifndef OPENSSL_NO_SCTP +#define DTLS1_SCTP_ST_CW_WRITE_SOCK (0x310|SSL_ST_CONNECT) +#define DTLS1_SCTP_ST_CR_READ_SOCK (0x320|SSL_ST_CONNECT) +#endif /* write to server */ #define SSL3_ST_CW_CLNT_HELLO_A (0x110|SSL_ST_CONNECT) #define SSL3_ST_CW_CLNT_HELLO_B (0x111|SSL_ST_CONNECT) @@ -557,6 +578,8 @@ typedef struct ssl3_state_st #define SSL3_ST_CW_CERT_VRFY_B (0x191|SSL_ST_CONNECT) #define SSL3_ST_CW_CHANGE_A (0x1A0|SSL_ST_CONNECT) #define SSL3_ST_CW_CHANGE_B (0x1A1|SSL_ST_CONNECT) +#define SSL3_ST_CW_NEXT_PROTO_A (0x200|SSL_ST_CONNECT) +#define SSL3_ST_CW_NEXT_PROTO_B (0x201|SSL_ST_CONNECT) #define SSL3_ST_CW_FINISHED_A (0x1B0|SSL_ST_CONNECT) #define SSL3_ST_CW_FINISHED_B (0x1B1|SSL_ST_CONNECT) /* read from server */ @@ -572,6 +595,10 @@ typedef struct ssl3_state_st /* server */ /* extra state */ #define SSL3_ST_SW_FLUSH (0x100|SSL_ST_ACCEPT) +#ifndef OPENSSL_NO_SCTP +#define DTLS1_SCTP_ST_SW_WRITE_SOCK (0x310|SSL_ST_ACCEPT) +#define DTLS1_SCTP_ST_SR_READ_SOCK (0x320|SSL_ST_ACCEPT) +#endif /* read from client */ /* Do not change the number values, they do matter */ #define SSL3_ST_SR_CLNT_HELLO_A (0x110|SSL_ST_ACCEPT) @@ -602,6 +629,8 @@ typedef struct ssl3_state_st #define SSL3_ST_SR_CERT_VRFY_B (0x1A1|SSL_ST_ACCEPT) #define SSL3_ST_SR_CHANGE_A (0x1B0|SSL_ST_ACCEPT) #define SSL3_ST_SR_CHANGE_B (0x1B1|SSL_ST_ACCEPT) +#define SSL3_ST_SR_NEXT_PROTO_A (0x210|SSL_ST_ACCEPT) +#define SSL3_ST_SR_NEXT_PROTO_B (0x211|SSL_ST_ACCEPT) #define SSL3_ST_SR_FINISHED_A (0x1C0|SSL_ST_ACCEPT) #define SSL3_ST_SR_FINISHED_B (0x1C1|SSL_ST_ACCEPT) /* write to client */ @@ -626,6 +655,7 @@ typedef struct ssl3_state_st #define SSL3_MT_CLIENT_KEY_EXCHANGE 16 #define SSL3_MT_FINISHED 20 #define SSL3_MT_CERTIFICATE_STATUS 22 +#define SSL3_MT_NEXT_PROTO 67 #define DTLS1_MT_HELLO_VERIFY_REQUEST 3 diff --git a/src/lib/libssl/ssl_algs.c b/src/lib/libssl/ssl_algs.c index 0967b2dfe4..d443143c59 100644 --- a/src/lib/libssl/ssl_algs.c +++ b/src/lib/libssl/ssl_algs.c @@ -73,6 +73,9 @@ int SSL_library_init(void) #endif #ifndef OPENSSL_NO_RC4 EVP_add_cipher(EVP_rc4()); +#if !defined(OPENSSL_NO_MD5) && (defined(__x86_64) || defined(__x86_64__)) + EVP_add_cipher(EVP_rc4_hmac_md5()); +#endif #endif #ifndef OPENSSL_NO_RC2 EVP_add_cipher(EVP_rc2_cbc()); @@ -85,6 +88,12 @@ int SSL_library_init(void) EVP_add_cipher(EVP_aes_128_cbc()); EVP_add_cipher(EVP_aes_192_cbc()); EVP_add_cipher(EVP_aes_256_cbc()); + EVP_add_cipher(EVP_aes_128_gcm()); + EVP_add_cipher(EVP_aes_256_gcm()); +#if !defined(OPENSSL_NO_SHA) && !defined(OPENSSL_NO_SHA1) + EVP_add_cipher(EVP_aes_128_cbc_hmac_sha1()); + EVP_add_cipher(EVP_aes_256_cbc_hmac_sha1()); +#endif #endif #ifndef OPENSSL_NO_CAMELLIA EVP_add_cipher(EVP_camellia_128_cbc()); diff --git a/src/lib/libssl/ssl_asn1.c b/src/lib/libssl/ssl_asn1.c index d7f4c6087e..38540be1e5 100644 --- a/src/lib/libssl/ssl_asn1.c +++ b/src/lib/libssl/ssl_asn1.c @@ -114,6 +114,9 @@ typedef struct ssl_session_asn1_st ASN1_OCTET_STRING psk_identity_hint; ASN1_OCTET_STRING psk_identity; #endif /* OPENSSL_NO_PSK */ +#ifndef OPENSSL_NO_SRP + ASN1_OCTET_STRING srp_username; +#endif /* OPENSSL_NO_SRP */ } SSL_SESSION_ASN1; int i2d_SSL_SESSION(SSL_SESSION *in, unsigned char **pp) @@ -129,6 +132,9 @@ int i2d_SSL_SESSION(SSL_SESSION *in, unsigned char **pp) #ifndef OPENSSL_NO_COMP unsigned char cbuf; int v11=0; +#endif +#ifndef OPENSSL_NO_SRP + int v12=0; #endif long l; SSL_SESSION_ASN1 a; @@ -267,6 +273,14 @@ int i2d_SSL_SESSION(SSL_SESSION *in, unsigned char **pp) a.psk_identity.data=(unsigned char *)(in->psk_identity); } #endif /* OPENSSL_NO_PSK */ +#ifndef OPENSSL_NO_SRP + if (in->srp_username) + { + a.srp_username.length=strlen(in->srp_username); + a.srp_username.type=V_ASN1_OCTET_STRING; + a.srp_username.data=(unsigned char *)(in->srp_username); + } +#endif /* OPENSSL_NO_SRP */ M_ASN1_I2D_len(&(a.version), i2d_ASN1_INTEGER); M_ASN1_I2D_len(&(a.ssl_version), i2d_ASN1_INTEGER); @@ -307,6 +321,10 @@ int i2d_SSL_SESSION(SSL_SESSION *in, unsigned char **pp) if (in->psk_identity) M_ASN1_I2D_len_EXP_opt(&(a.psk_identity), i2d_ASN1_OCTET_STRING,8,v8); #endif /* OPENSSL_NO_PSK */ +#ifndef OPENSSL_NO_SRP + if (in->srp_username) + M_ASN1_I2D_len_EXP_opt(&(a.srp_username), i2d_ASN1_OCTET_STRING,12,v12); +#endif /* OPENSSL_NO_SRP */ M_ASN1_I2D_seq_total(); @@ -351,6 +369,10 @@ int i2d_SSL_SESSION(SSL_SESSION *in, unsigned char **pp) if (in->compress_meth) M_ASN1_I2D_put_EXP_opt(&(a.comp_id), i2d_ASN1_OCTET_STRING,11,v11); #endif +#ifndef OPENSSL_NO_SRP + if (in->srp_username) + M_ASN1_I2D_put_EXP_opt(&(a.srp_username), i2d_ASN1_OCTET_STRING,12,v12); +#endif /* OPENSSL_NO_SRP */ M_ASN1_I2D_finish(); } @@ -549,6 +571,19 @@ SSL_SESSION *d2i_SSL_SESSION(SSL_SESSION **a, const unsigned char **pp, } else ret->psk_identity_hint=NULL; + + os.length=0; + os.data=NULL; + M_ASN1_D2I_get_EXP_opt(osp,d2i_ASN1_OCTET_STRING,8); + if (os.data) + { + ret->psk_identity = BUF_strndup((char *)os.data, os.length); + OPENSSL_free(os.data); + os.data = NULL; + os.length = 0; + } + else + ret->psk_identity=NULL; #endif /* OPENSSL_NO_PSK */ #ifndef OPENSSL_NO_TLSEXT @@ -588,5 +623,20 @@ SSL_SESSION *d2i_SSL_SESSION(SSL_SESSION **a, const unsigned char **pp, } #endif +#ifndef OPENSSL_NO_SRP + os.length=0; + os.data=NULL; + M_ASN1_D2I_get_EXP_opt(osp,d2i_ASN1_OCTET_STRING,12); + if (os.data) + { + ret->srp_username = BUF_strndup((char *)os.data, os.length); + OPENSSL_free(os.data); + os.data = NULL; + os.length = 0; + } + else + ret->srp_username=NULL; +#endif /* OPENSSL_NO_SRP */ + M_ASN1_D2I_Finish(a,SSL_SESSION_free,SSL_F_D2I_SSL_SESSION); } diff --git a/src/lib/libssl/ssl_cert.c b/src/lib/libssl/ssl_cert.c index 27256eea81..917be31876 100644 --- a/src/lib/libssl/ssl_cert.c +++ b/src/lib/libssl/ssl_cert.c @@ -160,6 +160,21 @@ int SSL_get_ex_data_X509_STORE_CTX_idx(void) return ssl_x509_store_ctx_idx; } +static void ssl_cert_set_default_md(CERT *cert) + { + /* Set digest values to defaults */ +#ifndef OPENSSL_NO_DSA + cert->pkeys[SSL_PKEY_DSA_SIGN].digest = EVP_dss1(); +#endif +#ifndef OPENSSL_NO_RSA + cert->pkeys[SSL_PKEY_RSA_SIGN].digest = EVP_sha1(); + cert->pkeys[SSL_PKEY_RSA_ENC].digest = EVP_sha1(); +#endif +#ifndef OPENSSL_NO_ECDSA + cert->pkeys[SSL_PKEY_ECC].digest = EVP_ecdsa(); +#endif + } + CERT *ssl_cert_new(void) { CERT *ret; @@ -174,7 +189,7 @@ CERT *ssl_cert_new(void) ret->key= &(ret->pkeys[SSL_PKEY_RSA_ENC]); ret->references=1; - + ssl_cert_set_default_md(ret); return(ret); } @@ -307,6 +322,10 @@ CERT *ssl_cert_dup(CERT *cert) * chain is held inside SSL_CTX */ ret->references=1; + /* Set digests to defaults. NB: we don't copy existing values as they + * will be set during handshake. + */ + ssl_cert_set_default_md(ret); return(ret); diff --git a/src/lib/libssl/ssl_ciph.c b/src/lib/libssl/ssl_ciph.c index 54ba7ef5b4..92d1e94d6a 100644 --- a/src/lib/libssl/ssl_ciph.c +++ b/src/lib/libssl/ssl_ciph.c @@ -162,11 +162,13 @@ #define SSL_ENC_CAMELLIA256_IDX 9 #define SSL_ENC_GOST89_IDX 10 #define SSL_ENC_SEED_IDX 11 -#define SSL_ENC_NUM_IDX 12 +#define SSL_ENC_AES128GCM_IDX 12 +#define SSL_ENC_AES256GCM_IDX 13 +#define SSL_ENC_NUM_IDX 14 static const EVP_CIPHER *ssl_cipher_methods[SSL_ENC_NUM_IDX]={ - NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL, + NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL }; #define SSL_COMP_NULL_IDX 0 @@ -179,28 +181,32 @@ static STACK_OF(SSL_COMP) *ssl_comp_methods=NULL; #define SSL_MD_SHA1_IDX 1 #define SSL_MD_GOST94_IDX 2 #define SSL_MD_GOST89MAC_IDX 3 +#define SSL_MD_SHA256_IDX 4 +#define SSL_MD_SHA384_IDX 5 /*Constant SSL_MAX_DIGEST equal to size of digests array should be * defined in the * ssl_locl.h */ #define SSL_MD_NUM_IDX SSL_MAX_DIGEST static const EVP_MD *ssl_digest_methods[SSL_MD_NUM_IDX]={ - NULL,NULL,NULL,NULL + NULL,NULL,NULL,NULL,NULL,NULL }; /* PKEY_TYPE for GOST89MAC is known in advance, but, because * implementation is engine-provided, we'll fill it only if * corresponding EVP_PKEY_METHOD is found */ static int ssl_mac_pkey_id[SSL_MD_NUM_IDX]={ - EVP_PKEY_HMAC,EVP_PKEY_HMAC,EVP_PKEY_HMAC,NID_undef + EVP_PKEY_HMAC,EVP_PKEY_HMAC,EVP_PKEY_HMAC,NID_undef, + EVP_PKEY_HMAC,EVP_PKEY_HMAC }; static int ssl_mac_secret_size[SSL_MD_NUM_IDX]={ - 0,0,0,0 + 0,0,0,0,0,0 }; static int ssl_handshake_digest_flag[SSL_MD_NUM_IDX]={ SSL_HANDSHAKE_MAC_MD5,SSL_HANDSHAKE_MAC_SHA, - SSL_HANDSHAKE_MAC_GOST94,0 + SSL_HANDSHAKE_MAC_GOST94, 0, SSL_HANDSHAKE_MAC_SHA256, + SSL_HANDSHAKE_MAC_SHA384 }; #define CIPHER_ADD 1 @@ -247,6 +253,7 @@ static const SSL_CIPHER cipher_aliases[]={ {0,SSL_TXT_ECDH,0, SSL_kECDHr|SSL_kECDHe|SSL_kEECDH,0,0,0,0,0,0,0,0}, {0,SSL_TXT_kPSK,0, SSL_kPSK, 0,0,0,0,0,0,0,0}, + {0,SSL_TXT_kSRP,0, SSL_kSRP, 0,0,0,0,0,0,0,0}, {0,SSL_TXT_kGOST,0, SSL_kGOST,0,0,0,0,0,0,0,0}, /* server authentication aliases */ @@ -273,6 +280,7 @@ static const SSL_CIPHER cipher_aliases[]={ {0,SSL_TXT_ADH,0, SSL_kEDH,SSL_aNULL,0,0,0,0,0,0,0}, {0,SSL_TXT_AECDH,0, SSL_kEECDH,SSL_aNULL,0,0,0,0,0,0,0}, {0,SSL_TXT_PSK,0, SSL_kPSK,SSL_aPSK,0,0,0,0,0,0,0}, + {0,SSL_TXT_SRP,0, SSL_kSRP,0,0,0,0,0,0,0,0}, /* symmetric encryption aliases */ @@ -283,9 +291,10 @@ static const SSL_CIPHER cipher_aliases[]={ {0,SSL_TXT_IDEA,0, 0,0,SSL_IDEA, 0,0,0,0,0,0}, {0,SSL_TXT_SEED,0, 0,0,SSL_SEED, 0,0,0,0,0,0}, {0,SSL_TXT_eNULL,0, 0,0,SSL_eNULL, 0,0,0,0,0,0}, - {0,SSL_TXT_AES128,0, 0,0,SSL_AES128,0,0,0,0,0,0}, - {0,SSL_TXT_AES256,0, 0,0,SSL_AES256,0,0,0,0,0,0}, - {0,SSL_TXT_AES,0, 0,0,SSL_AES128|SSL_AES256,0,0,0,0,0,0}, + {0,SSL_TXT_AES128,0, 0,0,SSL_AES128|SSL_AES128GCM,0,0,0,0,0,0}, + {0,SSL_TXT_AES256,0, 0,0,SSL_AES256|SSL_AES256GCM,0,0,0,0,0,0}, + {0,SSL_TXT_AES,0, 0,0,SSL_AES,0,0,0,0,0,0}, + {0,SSL_TXT_AES_GCM,0, 0,0,SSL_AES128GCM|SSL_AES256GCM,0,0,0,0,0,0}, {0,SSL_TXT_CAMELLIA128,0,0,0,SSL_CAMELLIA128,0,0,0,0,0,0}, {0,SSL_TXT_CAMELLIA256,0,0,0,SSL_CAMELLIA256,0,0,0,0,0,0}, {0,SSL_TXT_CAMELLIA ,0,0,0,SSL_CAMELLIA128|SSL_CAMELLIA256,0,0,0,0,0,0}, @@ -296,6 +305,8 @@ static const SSL_CIPHER cipher_aliases[]={ {0,SSL_TXT_SHA,0, 0,0,0,SSL_SHA1, 0,0,0,0,0}, {0,SSL_TXT_GOST94,0, 0,0,0,SSL_GOST94, 0,0,0,0,0}, {0,SSL_TXT_GOST89MAC,0, 0,0,0,SSL_GOST89MAC, 0,0,0,0,0}, + {0,SSL_TXT_SHA256,0, 0,0,0,SSL_SHA256, 0,0,0,0,0}, + {0,SSL_TXT_SHA384,0, 0,0,0,SSL_SHA384, 0,0,0,0,0}, /* protocol version aliases */ {0,SSL_TXT_SSLV2,0, 0,0,0,0,SSL_SSLV2, 0,0,0,0}, @@ -379,6 +390,11 @@ void ssl_load_ciphers(void) ssl_cipher_methods[SSL_ENC_SEED_IDX]= EVP_get_cipherbyname(SN_seed_cbc); + ssl_cipher_methods[SSL_ENC_AES128GCM_IDX]= + EVP_get_cipherbyname(SN_aes_128_gcm); + ssl_cipher_methods[SSL_ENC_AES256GCM_IDX]= + EVP_get_cipherbyname(SN_aes_256_gcm); + ssl_digest_methods[SSL_MD_MD5_IDX]= EVP_get_digestbyname(SN_md5); ssl_mac_secret_size[SSL_MD_MD5_IDX]= @@ -404,6 +420,14 @@ void ssl_load_ciphers(void) ssl_mac_secret_size[SSL_MD_GOST89MAC_IDX]=32; } + ssl_digest_methods[SSL_MD_SHA256_IDX]= + EVP_get_digestbyname(SN_sha256); + ssl_mac_secret_size[SSL_MD_SHA256_IDX]= + EVP_MD_size(ssl_digest_methods[SSL_MD_SHA256_IDX]); + ssl_digest_methods[SSL_MD_SHA384_IDX]= + EVP_get_digestbyname(SN_sha384); + ssl_mac_secret_size[SSL_MD_SHA384_IDX]= + EVP_MD_size(ssl_digest_methods[SSL_MD_SHA384_IDX]); } #ifndef OPENSSL_NO_COMP @@ -526,6 +550,12 @@ int ssl_cipher_get_evp(const SSL_SESSION *s, const EVP_CIPHER **enc, case SSL_SEED: i=SSL_ENC_SEED_IDX; break; + case SSL_AES128GCM: + i=SSL_ENC_AES128GCM_IDX; + break; + case SSL_AES256GCM: + i=SSL_ENC_AES256GCM_IDX; + break; default: i= -1; break; @@ -549,6 +579,12 @@ int ssl_cipher_get_evp(const SSL_SESSION *s, const EVP_CIPHER **enc, case SSL_SHA1: i=SSL_MD_SHA1_IDX; break; + case SSL_SHA256: + i=SSL_MD_SHA256_IDX; + break; + case SSL_SHA384: + i=SSL_MD_SHA384_IDX; + break; case SSL_GOST94: i = SSL_MD_GOST94_IDX; break; @@ -564,17 +600,45 @@ int ssl_cipher_get_evp(const SSL_SESSION *s, const EVP_CIPHER **enc, *md=NULL; if (mac_pkey_type!=NULL) *mac_pkey_type = NID_undef; if (mac_secret_size!=NULL) *mac_secret_size = 0; - + if (c->algorithm_mac == SSL_AEAD) + mac_pkey_type = NULL; } else { *md=ssl_digest_methods[i]; if (mac_pkey_type!=NULL) *mac_pkey_type = ssl_mac_pkey_id[i]; if (mac_secret_size!=NULL) *mac_secret_size = ssl_mac_secret_size[i]; - } + } + + if ((*enc != NULL) && + (*md != NULL || (EVP_CIPHER_flags(*enc)&EVP_CIPH_FLAG_AEAD_CIPHER)) && + (!mac_pkey_type||*mac_pkey_type != NID_undef)) + { + const EVP_CIPHER *evp; + + if (s->ssl_version>>8 != TLS1_VERSION_MAJOR || + s->ssl_version < TLS1_VERSION) + return 1; - if ((*enc != NULL) && (*md != NULL) && (!mac_pkey_type||*mac_pkey_type != NID_undef)) +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + return 1; +#endif + + if (c->algorithm_enc == SSL_RC4 && + c->algorithm_mac == SSL_MD5 && + (evp=EVP_get_cipherbyname("RC4-HMAC-MD5"))) + *enc = evp, *md = NULL; + else if (c->algorithm_enc == SSL_AES128 && + c->algorithm_mac == SSL_SHA1 && + (evp=EVP_get_cipherbyname("AES-128-CBC-HMAC-SHA1"))) + *enc = evp, *md = NULL; + else if (c->algorithm_enc == SSL_AES256 && + c->algorithm_mac == SSL_SHA1 && + (evp=EVP_get_cipherbyname("AES-256-CBC-HMAC-SHA1"))) + *enc = evp, *md = NULL; return(1); + } else return(0); } @@ -585,9 +649,11 @@ int ssl_get_handshake_digest(int idx, long *mask, const EVP_MD **md) { return 0; } - if (ssl_handshake_digest_flag[idx]==0) return 0; *mask = ssl_handshake_digest_flag[idx]; - *md = ssl_digest_methods[idx]; + if (*mask) + *md = ssl_digest_methods[idx]; + else + *md = NULL; return 1; } @@ -661,6 +727,9 @@ static void ssl_cipher_get_disabled(unsigned long *mkey, unsigned long *auth, un #ifdef OPENSSL_NO_PSK *mkey |= SSL_kPSK; *auth |= SSL_aPSK; +#endif +#ifdef OPENSSL_NO_SRP + *mkey |= SSL_kSRP; #endif /* Check for presence of GOST 34.10 algorithms, and if they * do not present, disable appropriate auth and key exchange */ @@ -687,6 +756,8 @@ static void ssl_cipher_get_disabled(unsigned long *mkey, unsigned long *auth, un *enc |= (ssl_cipher_methods[SSL_ENC_IDEA_IDX] == NULL) ? SSL_IDEA:0; *enc |= (ssl_cipher_methods[SSL_ENC_AES128_IDX] == NULL) ? SSL_AES128:0; *enc |= (ssl_cipher_methods[SSL_ENC_AES256_IDX] == NULL) ? SSL_AES256:0; + *enc |= (ssl_cipher_methods[SSL_ENC_AES128GCM_IDX] == NULL) ? SSL_AES128GCM:0; + *enc |= (ssl_cipher_methods[SSL_ENC_AES256GCM_IDX] == NULL) ? SSL_AES256GCM:0; *enc |= (ssl_cipher_methods[SSL_ENC_CAMELLIA128_IDX] == NULL) ? SSL_CAMELLIA128:0; *enc |= (ssl_cipher_methods[SSL_ENC_CAMELLIA256_IDX] == NULL) ? SSL_CAMELLIA256:0; *enc |= (ssl_cipher_methods[SSL_ENC_GOST89_IDX] == NULL) ? SSL_eGOST2814789CNT:0; @@ -694,6 +765,8 @@ static void ssl_cipher_get_disabled(unsigned long *mkey, unsigned long *auth, un *mac |= (ssl_digest_methods[SSL_MD_MD5_IDX ] == NULL) ? SSL_MD5 :0; *mac |= (ssl_digest_methods[SSL_MD_SHA1_IDX] == NULL) ? SSL_SHA1:0; + *mac |= (ssl_digest_methods[SSL_MD_SHA256_IDX] == NULL) ? SSL_SHA256:0; + *mac |= (ssl_digest_methods[SSL_MD_SHA384_IDX] == NULL) ? SSL_SHA384:0; *mac |= (ssl_digest_methods[SSL_MD_GOST94_IDX] == NULL) ? SSL_GOST94:0; *mac |= (ssl_digest_methods[SSL_MD_GOST89MAC_IDX] == NULL || ssl_mac_pkey_id[SSL_MD_GOST89MAC_IDX]==NID_undef)? SSL_GOST89MAC:0; @@ -724,6 +797,9 @@ static void ssl_cipher_collect_ciphers(const SSL_METHOD *ssl_method, c = ssl_method->get_cipher(i); /* drop those that use any of that is not available */ if ((c != NULL) && c->valid && +#ifdef OPENSSL_FIPS + (!FIPS_mode() || (c->algo_strength & SSL_FIPS)) && +#endif !(c->algorithm_mkey & disabled_mkey) && !(c->algorithm_auth & disabled_auth) && !(c->algorithm_enc & disabled_enc) && @@ -1423,7 +1499,11 @@ STACK_OF(SSL_CIPHER) *ssl_create_cipher_list(const SSL_METHOD *ssl_method, */ for (curr = head; curr != NULL; curr = curr->next) { +#ifdef OPENSSL_FIPS + if (curr->active && (!FIPS_mode() || curr->cipher->algo_strength & SSL_FIPS)) +#else if (curr->active) +#endif { sk_SSL_CIPHER_push(cipherstack, curr->cipher); #ifdef CIPHER_DEBUG @@ -1480,6 +1560,8 @@ char *SSL_CIPHER_description(const SSL_CIPHER *cipher, char *buf, int len) ver="SSLv2"; else if (alg_ssl & SSL_SSLV3) ver="SSLv3"; + else if (alg_ssl & SSL_TLSV1_2) + ver="TLSv1.2"; else ver="unknown"; @@ -1512,6 +1594,9 @@ char *SSL_CIPHER_description(const SSL_CIPHER *cipher, char *buf, int len) case SSL_kPSK: kx="PSK"; break; + case SSL_kSRP: + kx="SRP"; + break; default: kx="unknown"; } @@ -1574,6 +1659,12 @@ char *SSL_CIPHER_description(const SSL_CIPHER *cipher, char *buf, int len) case SSL_AES256: enc="AES(256)"; break; + case SSL_AES128GCM: + enc="AESGCM(128)"; + break; + case SSL_AES256GCM: + enc="AESGCM(256)"; + break; case SSL_CAMELLIA128: enc="Camellia(128)"; break; @@ -1596,6 +1687,15 @@ char *SSL_CIPHER_description(const SSL_CIPHER *cipher, char *buf, int len) case SSL_SHA1: mac="SHA1"; break; + case SSL_SHA256: + mac="SHA256"; + break; + case SSL_SHA384: + mac="SHA384"; + break; + case SSL_AEAD: + mac="AEAD"; + break; default: mac="unknown"; break; @@ -1653,6 +1753,11 @@ int SSL_CIPHER_get_bits(const SSL_CIPHER *c, int *alg_bits) return(ret); } +unsigned long SSL_CIPHER_get_id(const SSL_CIPHER *c) + { + return c->id; + } + SSL_COMP *ssl3_comp_find(STACK_OF(SSL_COMP) *sk, int n) { SSL_COMP *ctmp; diff --git a/src/lib/libssl/ssl_err.c b/src/lib/libssl/ssl_err.c index e9be77109f..2577c6895a 100644 --- a/src/lib/libssl/ssl_err.c +++ b/src/lib/libssl/ssl_err.c @@ -80,6 +80,7 @@ static ERR_STRING_DATA SSL_str_functs[]= {ERR_FUNC(SSL_F_DTLS1_ACCEPT), "DTLS1_ACCEPT"}, {ERR_FUNC(SSL_F_DTLS1_ADD_CERT_TO_BUF), "DTLS1_ADD_CERT_TO_BUF"}, {ERR_FUNC(SSL_F_DTLS1_BUFFER_RECORD), "DTLS1_BUFFER_RECORD"}, +{ERR_FUNC(SSL_F_DTLS1_CHECK_TIMEOUT_NUM), "DTLS1_CHECK_TIMEOUT_NUM"}, {ERR_FUNC(SSL_F_DTLS1_CLIENT_HELLO), "DTLS1_CLIENT_HELLO"}, {ERR_FUNC(SSL_F_DTLS1_CONNECT), "DTLS1_CONNECT"}, {ERR_FUNC(SSL_F_DTLS1_ENC), "DTLS1_ENC"}, @@ -88,6 +89,7 @@ static ERR_STRING_DATA SSL_str_functs[]= {ERR_FUNC(SSL_F_DTLS1_GET_MESSAGE_FRAGMENT), "DTLS1_GET_MESSAGE_FRAGMENT"}, {ERR_FUNC(SSL_F_DTLS1_GET_RECORD), "DTLS1_GET_RECORD"}, {ERR_FUNC(SSL_F_DTLS1_HANDLE_TIMEOUT), "DTLS1_HANDLE_TIMEOUT"}, +{ERR_FUNC(SSL_F_DTLS1_HEARTBEAT), "DTLS1_HEARTBEAT"}, {ERR_FUNC(SSL_F_DTLS1_OUTPUT_CERT_CHAIN), "DTLS1_OUTPUT_CERT_CHAIN"}, {ERR_FUNC(SSL_F_DTLS1_PREPROCESS_FRAGMENT), "DTLS1_PREPROCESS_FRAGMENT"}, {ERR_FUNC(SSL_F_DTLS1_PROCESS_OUT_OF_SEQ_MESSAGE), "DTLS1_PROCESS_OUT_OF_SEQ_MESSAGE"}, @@ -156,6 +158,7 @@ static ERR_STRING_DATA SSL_str_functs[]= {ERR_FUNC(SSL_F_SSL3_GET_KEY_EXCHANGE), "SSL3_GET_KEY_EXCHANGE"}, {ERR_FUNC(SSL_F_SSL3_GET_MESSAGE), "SSL3_GET_MESSAGE"}, {ERR_FUNC(SSL_F_SSL3_GET_NEW_SESSION_TICKET), "SSL3_GET_NEW_SESSION_TICKET"}, +{ERR_FUNC(SSL_F_SSL3_GET_NEXT_PROTO), "SSL3_GET_NEXT_PROTO"}, {ERR_FUNC(SSL_F_SSL3_GET_RECORD), "SSL3_GET_RECORD"}, {ERR_FUNC(SSL_F_SSL3_GET_SERVER_CERTIFICATE), "SSL3_GET_SERVER_CERTIFICATE"}, {ERR_FUNC(SSL_F_SSL3_GET_SERVER_DONE), "SSL3_GET_SERVER_DONE"}, @@ -180,10 +183,12 @@ static ERR_STRING_DATA SSL_str_functs[]= {ERR_FUNC(SSL_F_SSL3_WRITE_PENDING), "SSL3_WRITE_PENDING"}, {ERR_FUNC(SSL_F_SSL_ADD_CLIENTHELLO_RENEGOTIATE_EXT), "SSL_ADD_CLIENTHELLO_RENEGOTIATE_EXT"}, {ERR_FUNC(SSL_F_SSL_ADD_CLIENTHELLO_TLSEXT), "SSL_ADD_CLIENTHELLO_TLSEXT"}, +{ERR_FUNC(SSL_F_SSL_ADD_CLIENTHELLO_USE_SRTP_EXT), "SSL_ADD_CLIENTHELLO_USE_SRTP_EXT"}, {ERR_FUNC(SSL_F_SSL_ADD_DIR_CERT_SUBJECTS_TO_STACK), "SSL_add_dir_cert_subjects_to_stack"}, {ERR_FUNC(SSL_F_SSL_ADD_FILE_CERT_SUBJECTS_TO_STACK), "SSL_add_file_cert_subjects_to_stack"}, {ERR_FUNC(SSL_F_SSL_ADD_SERVERHELLO_RENEGOTIATE_EXT), "SSL_ADD_SERVERHELLO_RENEGOTIATE_EXT"}, {ERR_FUNC(SSL_F_SSL_ADD_SERVERHELLO_TLSEXT), "SSL_ADD_SERVERHELLO_TLSEXT"}, +{ERR_FUNC(SSL_F_SSL_ADD_SERVERHELLO_USE_SRTP_EXT), "SSL_ADD_SERVERHELLO_USE_SRTP_EXT"}, {ERR_FUNC(SSL_F_SSL_BAD_METHOD), "SSL_BAD_METHOD"}, {ERR_FUNC(SSL_F_SSL_BYTES_TO_CIPHER_LIST), "SSL_BYTES_TO_CIPHER_LIST"}, {ERR_FUNC(SSL_F_SSL_CERT_DUP), "SSL_CERT_DUP"}, @@ -200,6 +205,7 @@ static ERR_STRING_DATA SSL_str_functs[]= {ERR_FUNC(SSL_F_SSL_CREATE_CIPHER_LIST), "SSL_CREATE_CIPHER_LIST"}, {ERR_FUNC(SSL_F_SSL_CTRL), "SSL_ctrl"}, {ERR_FUNC(SSL_F_SSL_CTX_CHECK_PRIVATE_KEY), "SSL_CTX_check_private_key"}, +{ERR_FUNC(SSL_F_SSL_CTX_MAKE_PROFILES), "SSL_CTX_MAKE_PROFILES"}, {ERR_FUNC(SSL_F_SSL_CTX_NEW), "SSL_CTX_new"}, {ERR_FUNC(SSL_F_SSL_CTX_SET_CIPHER_LIST), "SSL_CTX_set_cipher_list"}, {ERR_FUNC(SSL_F_SSL_CTX_SET_CLIENT_CERT_ENGINE), "SSL_CTX_set_client_cert_engine"}, @@ -228,8 +234,10 @@ static ERR_STRING_DATA SSL_str_functs[]= {ERR_FUNC(SSL_F_SSL_NEW), "SSL_new"}, {ERR_FUNC(SSL_F_SSL_PARSE_CLIENTHELLO_RENEGOTIATE_EXT), "SSL_PARSE_CLIENTHELLO_RENEGOTIATE_EXT"}, {ERR_FUNC(SSL_F_SSL_PARSE_CLIENTHELLO_TLSEXT), "SSL_PARSE_CLIENTHELLO_TLSEXT"}, +{ERR_FUNC(SSL_F_SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT), "SSL_PARSE_CLIENTHELLO_USE_SRTP_EXT"}, {ERR_FUNC(SSL_F_SSL_PARSE_SERVERHELLO_RENEGOTIATE_EXT), "SSL_PARSE_SERVERHELLO_RENEGOTIATE_EXT"}, {ERR_FUNC(SSL_F_SSL_PARSE_SERVERHELLO_TLSEXT), "SSL_PARSE_SERVERHELLO_TLSEXT"}, +{ERR_FUNC(SSL_F_SSL_PARSE_SERVERHELLO_USE_SRTP_EXT), "SSL_PARSE_SERVERHELLO_USE_SRTP_EXT"}, {ERR_FUNC(SSL_F_SSL_PEEK), "SSL_peek"}, {ERR_FUNC(SSL_F_SSL_PREPARE_CLIENTHELLO_TLSEXT), "SSL_PREPARE_CLIENTHELLO_TLSEXT"}, {ERR_FUNC(SSL_F_SSL_PREPARE_SERVERHELLO_TLSEXT), "SSL_PREPARE_SERVERHELLO_TLSEXT"}, @@ -238,6 +246,7 @@ static ERR_STRING_DATA SSL_str_functs[]= {ERR_FUNC(SSL_F_SSL_RSA_PUBLIC_ENCRYPT), "SSL_RSA_PUBLIC_ENCRYPT"}, {ERR_FUNC(SSL_F_SSL_SESSION_NEW), "SSL_SESSION_new"}, {ERR_FUNC(SSL_F_SSL_SESSION_PRINT_FP), "SSL_SESSION_print_fp"}, +{ERR_FUNC(SSL_F_SSL_SESSION_SET1_ID_CONTEXT), "SSL_SESSION_set1_id_context"}, {ERR_FUNC(SSL_F_SSL_SESS_CERT_NEW), "SSL_SESS_CERT_NEW"}, {ERR_FUNC(SSL_F_SSL_SET_CERT), "SSL_SET_CERT"}, {ERR_FUNC(SSL_F_SSL_SET_CIPHER_LIST), "SSL_set_cipher_list"}, @@ -251,6 +260,7 @@ static ERR_STRING_DATA SSL_str_functs[]= {ERR_FUNC(SSL_F_SSL_SET_TRUST), "SSL_set_trust"}, {ERR_FUNC(SSL_F_SSL_SET_WFD), "SSL_set_wfd"}, {ERR_FUNC(SSL_F_SSL_SHUTDOWN), "SSL_shutdown"}, +{ERR_FUNC(SSL_F_SSL_SRP_CTX_INIT), "SSL_SRP_CTX_init"}, {ERR_FUNC(SSL_F_SSL_UNDEFINED_CONST_FUNCTION), "SSL_UNDEFINED_CONST_FUNCTION"}, {ERR_FUNC(SSL_F_SSL_UNDEFINED_FUNCTION), "SSL_UNDEFINED_FUNCTION"}, {ERR_FUNC(SSL_F_SSL_UNDEFINED_VOID_FUNCTION), "SSL_UNDEFINED_VOID_FUNCTION"}, @@ -270,6 +280,8 @@ static ERR_STRING_DATA SSL_str_functs[]= {ERR_FUNC(SSL_F_TLS1_CHANGE_CIPHER_STATE), "TLS1_CHANGE_CIPHER_STATE"}, {ERR_FUNC(SSL_F_TLS1_CHECK_SERVERHELLO_TLSEXT), "TLS1_CHECK_SERVERHELLO_TLSEXT"}, {ERR_FUNC(SSL_F_TLS1_ENC), "TLS1_ENC"}, +{ERR_FUNC(SSL_F_TLS1_EXPORT_KEYING_MATERIAL), "TLS1_EXPORT_KEYING_MATERIAL"}, +{ERR_FUNC(SSL_F_TLS1_HEARTBEAT), "SSL_F_TLS1_HEARTBEAT"}, {ERR_FUNC(SSL_F_TLS1_PREPARE_CLIENTHELLO_TLSEXT), "TLS1_PREPARE_CLIENTHELLO_TLSEXT"}, {ERR_FUNC(SSL_F_TLS1_PREPARE_SERVERHELLO_TLSEXT), "TLS1_PREPARE_SERVERHELLO_TLSEXT"}, {ERR_FUNC(SSL_F_TLS1_PRF), "tls1_prf"}, @@ -312,6 +324,13 @@ static ERR_STRING_DATA SSL_str_reasons[]= {ERR_REASON(SSL_R_BAD_RSA_MODULUS_LENGTH),"bad rsa modulus length"}, {ERR_REASON(SSL_R_BAD_RSA_SIGNATURE) ,"bad rsa signature"}, {ERR_REASON(SSL_R_BAD_SIGNATURE) ,"bad signature"}, +{ERR_REASON(SSL_R_BAD_SRP_A_LENGTH) ,"bad srp a length"}, +{ERR_REASON(SSL_R_BAD_SRP_B_LENGTH) ,"bad srp b length"}, +{ERR_REASON(SSL_R_BAD_SRP_G_LENGTH) ,"bad srp g length"}, +{ERR_REASON(SSL_R_BAD_SRP_N_LENGTH) ,"bad srp n length"}, +{ERR_REASON(SSL_R_BAD_SRP_S_LENGTH) ,"bad srp s length"}, +{ERR_REASON(SSL_R_BAD_SRTP_MKI_VALUE) ,"bad srtp mki value"}, +{ERR_REASON(SSL_R_BAD_SRTP_PROTECTION_PROFILE_LIST),"bad srtp protection profile list"}, {ERR_REASON(SSL_R_BAD_SSL_FILETYPE) ,"bad ssl filetype"}, {ERR_REASON(SSL_R_BAD_SSL_SESSION_ID_LENGTH),"bad ssl session id length"}, {ERR_REASON(SSL_R_BAD_STATE) ,"bad state"}, @@ -350,12 +369,15 @@ static ERR_STRING_DATA SSL_str_reasons[]= {ERR_REASON(SSL_R_ECC_CERT_SHOULD_HAVE_RSA_SIGNATURE),"ecc cert should have rsa signature"}, {ERR_REASON(SSL_R_ECC_CERT_SHOULD_HAVE_SHA1_SIGNATURE),"ecc cert should have sha1 signature"}, {ERR_REASON(SSL_R_ECGROUP_TOO_LARGE_FOR_CIPHER),"ecgroup too large for cipher"}, +{ERR_REASON(SSL_R_EMPTY_SRTP_PROTECTION_PROFILE_LIST),"empty srtp protection profile list"}, {ERR_REASON(SSL_R_ENCRYPTED_LENGTH_TOO_LONG),"encrypted length too long"}, {ERR_REASON(SSL_R_ERROR_GENERATING_TMP_RSA_KEY),"error generating tmp rsa key"}, {ERR_REASON(SSL_R_ERROR_IN_RECEIVED_CIPHER_LIST),"error in received cipher list"}, {ERR_REASON(SSL_R_EXCESSIVE_MESSAGE_SIZE),"excessive message size"}, {ERR_REASON(SSL_R_EXTRA_DATA_IN_MESSAGE) ,"extra data in message"}, {ERR_REASON(SSL_R_GOT_A_FIN_BEFORE_A_CCS),"got a fin before a ccs"}, +{ERR_REASON(SSL_R_GOT_NEXT_PROTO_BEFORE_A_CCS),"got next proto before a ccs"}, +{ERR_REASON(SSL_R_GOT_NEXT_PROTO_WITHOUT_EXTENSION),"got next proto without seeing extension"}, {ERR_REASON(SSL_R_HTTPS_PROXY_REQUEST) ,"https proxy request"}, {ERR_REASON(SSL_R_HTTP_REQUEST) ,"http request"}, {ERR_REASON(SSL_R_ILLEGAL_PADDING) ,"illegal padding"}, @@ -364,6 +386,7 @@ static ERR_STRING_DATA SSL_str_reasons[]= {ERR_REASON(SSL_R_INVALID_COMMAND) ,"invalid command"}, {ERR_REASON(SSL_R_INVALID_COMPRESSION_ALGORITHM),"invalid compression algorithm"}, {ERR_REASON(SSL_R_INVALID_PURPOSE) ,"invalid purpose"}, +{ERR_REASON(SSL_R_INVALID_SRP_USERNAME) ,"invalid srp username"}, {ERR_REASON(SSL_R_INVALID_STATUS_RESPONSE),"invalid status response"}, {ERR_REASON(SSL_R_INVALID_TICKET_KEYS_LENGTH),"invalid ticket keys length"}, {ERR_REASON(SSL_R_INVALID_TRUST) ,"invalid trust"}, @@ -393,6 +416,7 @@ static ERR_STRING_DATA SSL_str_reasons[]= {ERR_REASON(SSL_R_MISSING_RSA_CERTIFICATE),"missing rsa certificate"}, {ERR_REASON(SSL_R_MISSING_RSA_ENCRYPTING_CERT),"missing rsa encrypting cert"}, {ERR_REASON(SSL_R_MISSING_RSA_SIGNING_CERT),"missing rsa signing cert"}, +{ERR_REASON(SSL_R_MISSING_SRP_PARAM) ,"can't find SRP server param"}, {ERR_REASON(SSL_R_MISSING_TMP_DH_KEY) ,"missing tmp dh key"}, {ERR_REASON(SSL_R_MISSING_TMP_ECDH_KEY) ,"missing tmp ecdh key"}, {ERR_REASON(SSL_R_MISSING_TMP_RSA_KEY) ,"missing tmp rsa key"}, @@ -422,6 +446,7 @@ static ERR_STRING_DATA SSL_str_reasons[]= {ERR_REASON(SSL_R_NO_RENEGOTIATION) ,"no renegotiation"}, {ERR_REASON(SSL_R_NO_REQUIRED_DIGEST) ,"digest requred for handshake isn't computed"}, {ERR_REASON(SSL_R_NO_SHARED_CIPHER) ,"no shared cipher"}, +{ERR_REASON(SSL_R_NO_SRTP_PROFILES) ,"no srtp profiles"}, {ERR_REASON(SSL_R_NO_VERIFY_CALLBACK) ,"no verify callback"}, {ERR_REASON(SSL_R_NULL_SSL_CTX) ,"null ssl ctx"}, {ERR_REASON(SSL_R_NULL_SSL_METHOD_PASSED),"null ssl method passed"}, @@ -465,7 +490,12 @@ static ERR_STRING_DATA SSL_str_reasons[]= {ERR_REASON(SSL_R_SERVERHELLO_TLSEXT) ,"serverhello tlsext"}, {ERR_REASON(SSL_R_SESSION_ID_CONTEXT_UNINITIALIZED),"session id context uninitialized"}, {ERR_REASON(SSL_R_SHORT_READ) ,"short read"}, +{ERR_REASON(SSL_R_SIGNATURE_ALGORITHMS_ERROR),"signature algorithms error"}, {ERR_REASON(SSL_R_SIGNATURE_FOR_NON_SIGNING_CERTIFICATE),"signature for non signing certificate"}, +{ERR_REASON(SSL_R_SRP_A_CALC) ,"error with the srp params"}, +{ERR_REASON(SSL_R_SRTP_COULD_NOT_ALLOCATE_PROFILES),"srtp could not allocate profiles"}, +{ERR_REASON(SSL_R_SRTP_PROTECTION_PROFILE_LIST_TOO_LONG),"srtp protection profile list too long"}, +{ERR_REASON(SSL_R_SRTP_UNKNOWN_PROTECTION_PROFILE),"srtp unknown protection profile"}, {ERR_REASON(SSL_R_SSL23_DOING_SESSION_ID_REUSE),"ssl23 doing session id reuse"}, {ERR_REASON(SSL_R_SSL2_CONNECTION_ID_TOO_LONG),"ssl2 connection id too long"}, {ERR_REASON(SSL_R_SSL3_EXT_INVALID_ECPOINTFORMAT),"ssl3 ext invalid ecpointformat"}, @@ -510,6 +540,9 @@ static ERR_STRING_DATA SSL_str_reasons[]= {ERR_REASON(SSL_R_TLSV1_UNRECOGNIZED_NAME),"tlsv1 unrecognized name"}, {ERR_REASON(SSL_R_TLSV1_UNSUPPORTED_EXTENSION),"tlsv1 unsupported extension"}, {ERR_REASON(SSL_R_TLS_CLIENT_CERT_REQ_WITH_ANON_CIPHER),"tls client cert req with anon cipher"}, +{ERR_REASON(SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT),"peer does not accept heartbearts"}, +{ERR_REASON(SSL_R_TLS_HEARTBEAT_PENDING) ,"heartbeat request already pending"}, +{ERR_REASON(SSL_R_TLS_ILLEGAL_EXPORTER_LABEL),"tls illegal exporter label"}, {ERR_REASON(SSL_R_TLS_INVALID_ECPOINTFORMAT_LIST),"tls invalid ecpointformat list"}, {ERR_REASON(SSL_R_TLS_PEER_DID_NOT_RESPOND_WITH_CERTIFICATE_LIST),"tls peer did not respond with certificate list"}, {ERR_REASON(SSL_R_TLS_RSA_ENCRYPTED_VALUE_LENGTH_IS_WRONG),"tls rsa encrypted value length is wrong"}, @@ -531,6 +564,7 @@ static ERR_STRING_DATA SSL_str_reasons[]= {ERR_REASON(SSL_R_UNKNOWN_CERTIFICATE_TYPE),"unknown certificate type"}, {ERR_REASON(SSL_R_UNKNOWN_CIPHER_RETURNED),"unknown cipher returned"}, {ERR_REASON(SSL_R_UNKNOWN_CIPHER_TYPE) ,"unknown cipher type"}, +{ERR_REASON(SSL_R_UNKNOWN_DIGEST) ,"unknown digest"}, {ERR_REASON(SSL_R_UNKNOWN_KEY_EXCHANGE_TYPE),"unknown key exchange type"}, {ERR_REASON(SSL_R_UNKNOWN_PKEY_TYPE) ,"unknown pkey type"}, {ERR_REASON(SSL_R_UNKNOWN_PROTOCOL) ,"unknown protocol"}, @@ -545,12 +579,14 @@ static ERR_STRING_DATA SSL_str_reasons[]= {ERR_REASON(SSL_R_UNSUPPORTED_PROTOCOL) ,"unsupported protocol"}, {ERR_REASON(SSL_R_UNSUPPORTED_SSL_VERSION),"unsupported ssl version"}, {ERR_REASON(SSL_R_UNSUPPORTED_STATUS_TYPE),"unsupported status type"}, +{ERR_REASON(SSL_R_USE_SRTP_NOT_NEGOTIATED),"use srtp not negotiated"}, {ERR_REASON(SSL_R_WRITE_BIO_NOT_SET) ,"write bio not set"}, {ERR_REASON(SSL_R_WRONG_CIPHER_RETURNED) ,"wrong cipher returned"}, {ERR_REASON(SSL_R_WRONG_MESSAGE_TYPE) ,"wrong message type"}, {ERR_REASON(SSL_R_WRONG_NUMBER_OF_KEY_BITS),"wrong number of key bits"}, {ERR_REASON(SSL_R_WRONG_SIGNATURE_LENGTH),"wrong signature length"}, {ERR_REASON(SSL_R_WRONG_SIGNATURE_SIZE) ,"wrong signature size"}, +{ERR_REASON(SSL_R_WRONG_SIGNATURE_TYPE) ,"wrong signature type"}, {ERR_REASON(SSL_R_WRONG_SSL_VERSION) ,"wrong ssl version"}, {ERR_REASON(SSL_R_WRONG_VERSION_NUMBER) ,"wrong version number"}, {ERR_REASON(SSL_R_X509_LIB) ,"x509 lib"}, diff --git a/src/lib/libssl/ssl_lib.c b/src/lib/libssl/ssl_lib.c index 8e89911f48..f82d071d6e 100644 --- a/src/lib/libssl/ssl_lib.c +++ b/src/lib/libssl/ssl_lib.c @@ -176,7 +176,10 @@ SSL3_ENC_METHOD ssl3_undef_enc_method={ 0, /* client_finished_label_len */ NULL, /* server_finished_label */ 0, /* server_finished_label_len */ - (int (*)(int))ssl_undefined_function + (int (*)(int))ssl_undefined_function, + (int (*)(SSL *, unsigned char *, size_t, const char *, + size_t, const unsigned char *, size_t, + int use_context)) ssl_undefined_function, }; int SSL_clear(SSL *s) @@ -202,9 +205,9 @@ int SSL_clear(SSL *s) * needed because SSL_clear is not called when doing renegotiation) */ /* This is set if we are doing dynamic renegotiation so keep * the old cipher. It is sort of a SSL_clear_lite :-) */ - if (s->new_session) return(1); + if (s->renegotiate) return(1); #else - if (s->new_session) + if (s->renegotiate) { SSLerr(SSL_F_SSL_CLEAR,ERR_R_INTERNAL_ERROR); return 0; @@ -353,6 +356,9 @@ SSL *SSL_new(SSL_CTX *ctx) s->tlsext_ocsp_resplen = -1; CRYPTO_add(&ctx->references,1,CRYPTO_LOCK_SSL_CTX); s->initial_ctx=ctx; +# ifndef OPENSSL_NO_NEXTPROTONEG + s->next_proto_negotiated = NULL; +# endif #endif s->verify_result=X509_V_OK; @@ -586,6 +592,14 @@ void SSL_free(SSL *s) kssl_ctx_free(s->kssl_ctx); #endif /* OPENSSL_NO_KRB5 */ +#if !defined(OPENSSL_NO_TLSEXT) && !defined(OPENSSL_NO_NEXTPROTONEG) + if (s->next_proto_negotiated) + OPENSSL_free(s->next_proto_negotiated); +#endif + + if (s->srtp_profiles) + sk_SRTP_PROTECTION_PROFILE_free(s->srtp_profiles); + OPENSSL_free(s); } @@ -1008,10 +1022,21 @@ int SSL_shutdown(SSL *s) int SSL_renegotiate(SSL *s) { - if (s->new_session == 0) - { - s->new_session=1; - } + if (s->renegotiate == 0) + s->renegotiate=1; + + s->new_session=1; + + return(s->method->ssl_renegotiate(s)); + } + +int SSL_renegotiate_abbreviated(SSL *s) + { + if (s->renegotiate == 0) + s->renegotiate=1; + + s->new_session=0; + return(s->method->ssl_renegotiate(s)); } @@ -1019,7 +1044,7 @@ int SSL_renegotiate_pending(SSL *s) { /* becomes true when negotiation is requested; * false again once a handshake has finished */ - return (s->new_session != 0); + return (s->renegotiate != 0); } long SSL_ctrl(SSL *s,int cmd,long larg,void *parg) @@ -1054,8 +1079,10 @@ long SSL_ctrl(SSL *s,int cmd,long larg,void *parg) s->max_cert_list=larg; return(l); case SSL_CTRL_SET_MTU: +#ifndef OPENSSL_NO_DTLS1 if (larg < (long)dtls1_min_mtu()) return 0; +#endif if (SSL_version(s) == DTLS1_VERSION || SSL_version(s) == DTLS1_BAD_VER) @@ -1358,6 +1385,10 @@ int ssl_cipher_list_to_bytes(SSL *s,STACK_OF(SSL_CIPHER) *sk,unsigned char *p, for (i=0; ialgorithm_ssl & SSL_TLSV1_2) && + (TLS1_get_client_version(s) < TLS1_2_VERSION)) + continue; #ifndef OPENSSL_NO_KRB5 if (((c->algorithm_mkey & SSL_kKRB5) || (c->algorithm_auth & SSL_aKRB5)) && nokrb5) @@ -1375,7 +1406,7 @@ int ssl_cipher_list_to_bytes(SSL *s,STACK_OF(SSL_CIPHER) *sk,unsigned char *p, /* If p == q, no ciphers and caller indicates an error. Otherwise * add SCSV if not renegotiating. */ - if (p != q && !s->new_session) + if (p != q && !s->renegotiate) { static SSL_CIPHER scsv = { @@ -1422,7 +1453,7 @@ STACK_OF(SSL_CIPHER) *ssl_bytes_to_cipher_list(SSL *s,unsigned char *p,int num, (p[n-1] == (SSL3_CK_SCSV & 0xff))) { /* SCSV fatal if renegotiating */ - if (s->new_session) + if (s->renegotiate) { SSLerr(SSL_F_SSL_BYTES_TO_CIPHER_LIST,SSL_R_SCSV_RECEIVED_WHEN_RENEGOTIATING); ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_HANDSHAKE_FAILURE); @@ -1479,8 +1510,137 @@ int SSL_get_servername_type(const SSL *s) return TLSEXT_NAMETYPE_host_name; return -1; } + +# ifndef OPENSSL_NO_NEXTPROTONEG +/* SSL_select_next_proto implements the standard protocol selection. It is + * expected that this function is called from the callback set by + * SSL_CTX_set_next_proto_select_cb. + * + * The protocol data is assumed to be a vector of 8-bit, length prefixed byte + * strings. The length byte itself is not included in the length. A byte + * string of length 0 is invalid. No byte string may be truncated. + * + * The current, but experimental algorithm for selecting the protocol is: + * + * 1) If the server doesn't support NPN then this is indicated to the + * callback. In this case, the client application has to abort the connection + * or have a default application level protocol. + * + * 2) If the server supports NPN, but advertises an empty list then the + * client selects the first protcol in its list, but indicates via the + * API that this fallback case was enacted. + * + * 3) Otherwise, the client finds the first protocol in the server's list + * that it supports and selects this protocol. This is because it's + * assumed that the server has better information about which protocol + * a client should use. + * + * 4) If the client doesn't support any of the server's advertised + * protocols, then this is treated the same as case 2. + * + * It returns either + * OPENSSL_NPN_NEGOTIATED if a common protocol was found, or + * OPENSSL_NPN_NO_OVERLAP if the fallback case was reached. + */ +int SSL_select_next_proto(unsigned char **out, unsigned char *outlen, const unsigned char *server, unsigned int server_len, const unsigned char *client, unsigned int client_len) + { + unsigned int i, j; + const unsigned char *result; + int status = OPENSSL_NPN_UNSUPPORTED; + + /* For each protocol in server preference order, see if we support it. */ + for (i = 0; i < server_len; ) + { + for (j = 0; j < client_len; ) + { + if (server[i] == client[j] && + memcmp(&server[i+1], &client[j+1], server[i]) == 0) + { + /* We found a match */ + result = &server[i]; + status = OPENSSL_NPN_NEGOTIATED; + goto found; + } + j += client[j]; + j++; + } + i += server[i]; + i++; + } + + /* There's no overlap between our protocols and the server's list. */ + result = client; + status = OPENSSL_NPN_NO_OVERLAP; + + found: + *out = (unsigned char *) result + 1; + *outlen = result[0]; + return status; + } + +/* SSL_get0_next_proto_negotiated sets *data and *len to point to the client's + * requested protocol for this connection and returns 0. If the client didn't + * request any protocol, then *data is set to NULL. + * + * Note that the client can request any protocol it chooses. The value returned + * from this function need not be a member of the list of supported protocols + * provided by the callback. + */ +void SSL_get0_next_proto_negotiated(const SSL *s, const unsigned char **data, unsigned *len) + { + *data = s->next_proto_negotiated; + if (!*data) { + *len = 0; + } else { + *len = s->next_proto_negotiated_len; + } +} + +/* SSL_CTX_set_next_protos_advertised_cb sets a callback that is called when a + * TLS server needs a list of supported protocols for Next Protocol + * Negotiation. The returned list must be in wire format. The list is returned + * by setting |out| to point to it and |outlen| to its length. This memory will + * not be modified, but one should assume that the SSL* keeps a reference to + * it. + * + * The callback should return SSL_TLSEXT_ERR_OK if it wishes to advertise. Otherwise, no + * such extension will be included in the ServerHello. */ +void SSL_CTX_set_next_protos_advertised_cb(SSL_CTX *ctx, int (*cb) (SSL *ssl, const unsigned char **out, unsigned int *outlen, void *arg), void *arg) + { + ctx->next_protos_advertised_cb = cb; + ctx->next_protos_advertised_cb_arg = arg; + } + +/* SSL_CTX_set_next_proto_select_cb sets a callback that is called when a + * client needs to select a protocol from the server's provided list. |out| + * must be set to point to the selected protocol (which may be within |in|). + * The length of the protocol name must be written into |outlen|. The server's + * advertised protocols are provided in |in| and |inlen|. The callback can + * assume that |in| is syntactically valid. + * + * The client must select a protocol. It is fatal to the connection if this + * callback returns a value other than SSL_TLSEXT_ERR_OK. + */ +void SSL_CTX_set_next_proto_select_cb(SSL_CTX *ctx, int (*cb) (SSL *s, unsigned char **out, unsigned char *outlen, const unsigned char *in, unsigned int inlen, void *arg), void *arg) + { + ctx->next_proto_select_cb = cb; + ctx->next_proto_select_cb_arg = arg; + } +# endif #endif +int SSL_export_keying_material(SSL *s, unsigned char *out, size_t olen, + const char *label, size_t llen, const unsigned char *p, size_t plen, + int use_context) + { + if (s->version < TLS1_VERSION) + return -1; + + return s->method->ssl3_enc->export_keying_material(s, out, olen, label, + llen, p, plen, + use_context); + } + static unsigned long ssl_session_hash(const SSL_SESSION *a) { unsigned long l; @@ -1524,6 +1684,14 @@ SSL_CTX *SSL_CTX_new(const SSL_METHOD *meth) return(NULL); } +#ifdef OPENSSL_FIPS + if (FIPS_mode() && (meth->version < TLS1_VERSION)) + { + SSLerr(SSL_F_SSL_CTX_NEW, SSL_R_ONLY_TLS_ALLOWED_IN_FIPS_MODE); + return NULL; + } +#endif + if (SSL_get_ex_data_X509_STORE_CTX_idx() < 0) { SSLerr(SSL_F_SSL_CTX_NEW,SSL_R_X509_VERIFICATION_SETUP_PROBLEMS); @@ -1643,12 +1811,19 @@ SSL_CTX *SSL_CTX_new(const SSL_METHOD *meth) ret->tlsext_status_cb = 0; ret->tlsext_status_arg = NULL; +# ifndef OPENSSL_NO_NEXTPROTONEG + ret->next_protos_advertised_cb = 0; + ret->next_proto_select_cb = 0; +# endif #endif #ifndef OPENSSL_NO_PSK ret->psk_identity_hint=NULL; ret->psk_client_callback=NULL; ret->psk_server_callback=NULL; #endif +#ifndef OPENSSL_NO_SRP + SSL_CTX_SRP_CTX_init(ret); +#endif #ifndef OPENSSL_NO_BUF_FREELISTS ret->freelist_max_len = SSL_MAX_BUF_FREELIST_LEN_DEFAULT; ret->rbuf_freelist = OPENSSL_malloc(sizeof(SSL3_BUF_FREELIST)); @@ -1777,10 +1952,16 @@ void SSL_CTX_free(SSL_CTX *a) a->comp_methods = NULL; #endif + if (a->srtp_profiles) + sk_SRTP_PROTECTION_PROFILE_free(a->srtp_profiles); + #ifndef OPENSSL_NO_PSK if (a->psk_identity_hint) OPENSSL_free(a->psk_identity_hint); #endif +#ifndef OPENSSL_NO_SRP + SSL_CTX_SRP_CTX_free(a); +#endif #ifndef OPENSSL_NO_ENGINE if (a->client_cert_engine) ENGINE_finish(a->client_cert_engine); @@ -2034,12 +2215,13 @@ void ssl_set_cert_masks(CERT *c, const SSL_CIPHER *cipher) #ifndef OPENSSL_NO_EC -int ssl_check_srvr_ecc_cert_and_alg(X509 *x, const SSL_CIPHER *cs) +int ssl_check_srvr_ecc_cert_and_alg(X509 *x, SSL *s) { unsigned long alg_k, alg_a; EVP_PKEY *pkey = NULL; int keysize = 0; int signature_nid = 0, md_nid = 0, pk_nid = 0; + const SSL_CIPHER *cs = s->s3->tmp.new_cipher; alg_k = cs->algorithm_mkey; alg_a = cs->algorithm_auth; @@ -2069,7 +2251,7 @@ int ssl_check_srvr_ecc_cert_and_alg(X509 *x, const SSL_CIPHER *cs) SSLerr(SSL_F_SSL_CHECK_SRVR_ECC_CERT_AND_ALG, SSL_R_ECC_CERT_NOT_FOR_KEY_AGREEMENT); return 0; } - if (alg_k & SSL_kECDHe) + if ((alg_k & SSL_kECDHe) && TLS1_get_version(s) < TLS1_2_VERSION) { /* signature alg must be ECDSA */ if (pk_nid != NID_X9_62_id_ecPublicKey) @@ -2078,7 +2260,7 @@ int ssl_check_srvr_ecc_cert_and_alg(X509 *x, const SSL_CIPHER *cs) return 0; } } - if (alg_k & SSL_kECDHr) + if ((alg_k & SSL_kECDHr) && TLS1_get_version(s) < TLS1_2_VERSION) { /* signature alg must be RSA */ @@ -2168,34 +2350,36 @@ X509 *ssl_get_server_send_cert(SSL *s) return(c->pkeys[i].x509); } -EVP_PKEY *ssl_get_sign_pkey(SSL *s,const SSL_CIPHER *cipher) +EVP_PKEY *ssl_get_sign_pkey(SSL *s,const SSL_CIPHER *cipher, const EVP_MD **pmd) { unsigned long alg_a; CERT *c; + int idx = -1; alg_a = cipher->algorithm_auth; c=s->cert; if ((alg_a & SSL_aDSS) && (c->pkeys[SSL_PKEY_DSA_SIGN].privatekey != NULL)) - return(c->pkeys[SSL_PKEY_DSA_SIGN].privatekey); + idx = SSL_PKEY_DSA_SIGN; else if (alg_a & SSL_aRSA) { if (c->pkeys[SSL_PKEY_RSA_SIGN].privatekey != NULL) - return(c->pkeys[SSL_PKEY_RSA_SIGN].privatekey); + idx = SSL_PKEY_RSA_SIGN; else if (c->pkeys[SSL_PKEY_RSA_ENC].privatekey != NULL) - return(c->pkeys[SSL_PKEY_RSA_ENC].privatekey); - else - return(NULL); + idx = SSL_PKEY_RSA_ENC; } else if ((alg_a & SSL_aECDSA) && (c->pkeys[SSL_PKEY_ECC].privatekey != NULL)) - return(c->pkeys[SSL_PKEY_ECC].privatekey); - else /* if (alg_a & SSL_aNULL) */ + idx = SSL_PKEY_ECC; + if (idx == -1) { SSLerr(SSL_F_SSL_GET_SIGN_PKEY,ERR_R_INTERNAL_ERROR); return(NULL); } + if (pmd) + *pmd = c->pkeys[idx].digest; + return c->pkeys[idx].privatekey; } void ssl_update_cache(SSL *s,int mode) @@ -2420,6 +2604,10 @@ SSL_METHOD *ssl_bad_method(int ver) const char *SSL_get_version(const SSL *s) { + if (s->version == TLS1_2_VERSION) + return("TLSv1.2"); + else if (s->version == TLS1_1_VERSION) + return("TLSv1.1"); if (s->version == TLS1_VERSION) return("TLSv1"); else if (s->version == SSL3_VERSION) @@ -2514,6 +2702,7 @@ SSL *SSL_dup(SSL *s) ret->in_handshake = s->in_handshake; ret->handshake_func = s->handshake_func; ret->server = s->server; + ret->renegotiate = s->renegotiate; ret->new_session = s->new_session; ret->quiet_shutdown = s->quiet_shutdown; ret->shutdown=s->shutdown; @@ -2779,6 +2968,11 @@ int SSL_state(const SSL *ssl) return(ssl->state); } +void SSL_set_state(SSL *ssl, int state) + { + ssl->state = state; + } + void SSL_set_verify_result(SSL *ssl,long arg) { ssl->verify_result=arg; @@ -3037,6 +3231,16 @@ void ssl_clear_hash_ctx(EVP_MD_CTX **hash) *hash=NULL; } +void SSL_set_debug(SSL *s, int debug) + { + s->debug = debug; + } + +int SSL_cache_hit(SSL *s) + { + return s->hit; + } + #if defined(_WINDLL) && defined(OPENSSL_SYS_WIN16) #include "../crypto/bio/bss_file.c" #endif @@ -3045,4 +3249,3 @@ IMPLEMENT_STACK_OF(SSL_CIPHER) IMPLEMENT_STACK_OF(SSL_COMP) IMPLEMENT_OBJ_BSEARCH_GLOBAL_CMP_FN(SSL_CIPHER, SSL_CIPHER, ssl_cipher_id); - diff --git a/src/lib/libssl/ssl_locl.h b/src/lib/libssl/ssl_locl.h index cea622a2a6..d87fd51cfa 100644 --- a/src/lib/libssl/ssl_locl.h +++ b/src/lib/libssl/ssl_locl.h @@ -170,7 +170,7 @@ # define OPENSSL_EXTERN OPENSSL_EXPORT #endif -#define PKCS1_CHECK +#undef PKCS1_CHECK #define c2l(c,l) (l = ((unsigned long)(*((c)++))) , \ l|=(((unsigned long)(*((c)++)))<< 8), \ @@ -289,6 +289,7 @@ #define SSL_kEECDH 0x00000080L /* ephemeral ECDH */ #define SSL_kPSK 0x00000100L /* PSK */ #define SSL_kGOST 0x00000200L /* GOST key exchange */ +#define SSL_kSRP 0x00000400L /* SRP */ /* Bits for algorithm_auth (server authentication) */ #define SSL_aRSA 0x00000001L /* RSA auth */ @@ -316,21 +317,29 @@ #define SSL_CAMELLIA256 0x00000200L #define SSL_eGOST2814789CNT 0x00000400L #define SSL_SEED 0x00000800L +#define SSL_AES128GCM 0x00001000L +#define SSL_AES256GCM 0x00002000L -#define SSL_AES (SSL_AES128|SSL_AES256) +#define SSL_AES (SSL_AES128|SSL_AES256|SSL_AES128GCM|SSL_AES256GCM) #define SSL_CAMELLIA (SSL_CAMELLIA128|SSL_CAMELLIA256) /* Bits for algorithm_mac (symmetric authentication) */ + #define SSL_MD5 0x00000001L #define SSL_SHA1 0x00000002L #define SSL_GOST94 0x00000004L #define SSL_GOST89MAC 0x00000008L +#define SSL_SHA256 0x00000010L +#define SSL_SHA384 0x00000020L +/* Not a real MAC, just an indication it is part of cipher */ +#define SSL_AEAD 0x00000040L /* Bits for algorithm_ssl (protocol version) */ #define SSL_SSLV2 0x00000001L #define SSL_SSLV3 0x00000002L #define SSL_TLSV1 SSL_SSLV3 /* for now */ +#define SSL_TLSV1_2 0x00000004L /* Bits for algorithm2 (handshake digests and other extra flags) */ @@ -338,15 +347,21 @@ #define SSL_HANDSHAKE_MAC_MD5 0x10 #define SSL_HANDSHAKE_MAC_SHA 0x20 #define SSL_HANDSHAKE_MAC_GOST94 0x40 +#define SSL_HANDSHAKE_MAC_SHA256 0x80 +#define SSL_HANDSHAKE_MAC_SHA384 0x100 #define SSL_HANDSHAKE_MAC_DEFAULT (SSL_HANDSHAKE_MAC_MD5 | SSL_HANDSHAKE_MAC_SHA) /* When adding new digest in the ssl_ciph.c and increment SSM_MD_NUM_IDX * make sure to update this constant too */ -#define SSL_MAX_DIGEST 4 +#define SSL_MAX_DIGEST 6 + +#define TLS1_PRF_DGST_MASK (0xff << TLS1_PRF_DGST_SHIFT) -#define TLS1_PRF_DGST_SHIFT 8 +#define TLS1_PRF_DGST_SHIFT 10 #define TLS1_PRF_MD5 (SSL_HANDSHAKE_MAC_MD5 << TLS1_PRF_DGST_SHIFT) #define TLS1_PRF_SHA1 (SSL_HANDSHAKE_MAC_SHA << TLS1_PRF_DGST_SHIFT) +#define TLS1_PRF_SHA256 (SSL_HANDSHAKE_MAC_SHA256 << TLS1_PRF_DGST_SHIFT) +#define TLS1_PRF_SHA384 (SSL_HANDSHAKE_MAC_SHA384 << TLS1_PRF_DGST_SHIFT) #define TLS1_PRF_GOST94 (SSL_HANDSHAKE_MAC_GOST94 << TLS1_PRF_DGST_SHIFT) #define TLS1_PRF (TLS1_PRF_MD5 | TLS1_PRF_SHA1) @@ -457,6 +472,8 @@ typedef struct cert_pkey_st { X509 *x509; EVP_PKEY *privatekey; + /* Digest to use when signing */ + const EVP_MD *digest; } CERT_PKEY; typedef struct cert_st @@ -554,6 +571,10 @@ typedef struct ssl3_enc_method const char *server_finished_label; int server_finished_label_len; int (*alert_value)(int); + int (*export_keying_material)(SSL *, unsigned char *, size_t, + const char *, size_t, + const unsigned char *, size_t, + int use_context); } SSL3_ENC_METHOD; #ifndef OPENSSL_NO_COMP @@ -591,11 +612,12 @@ extern SSL3_ENC_METHOD TLSv1_enc_data; extern SSL3_ENC_METHOD SSLv3_enc_data; extern SSL3_ENC_METHOD DTLSv1_enc_data; -#define IMPLEMENT_tls1_meth_func(func_name, s_accept, s_connect, s_get_meth) \ +#define IMPLEMENT_tls_meth_func(version, func_name, s_accept, s_connect, \ + s_get_meth) \ const SSL_METHOD *func_name(void) \ { \ static const SSL_METHOD func_name##_data= { \ - TLS1_VERSION, \ + version, \ tls1_new, \ tls1_clear, \ tls1_free, \ @@ -669,7 +691,7 @@ const SSL_METHOD *func_name(void) \ const SSL_METHOD *func_name(void) \ { \ static const SSL_METHOD func_name##_data= { \ - TLS1_VERSION, \ + TLS1_2_VERSION, \ tls1_new, \ tls1_clear, \ tls1_free, \ @@ -752,7 +774,7 @@ const SSL_METHOD *func_name(void) \ ssl3_read, \ ssl3_peek, \ ssl3_write, \ - ssl3_shutdown, \ + dtls1_shutdown, \ ssl3_renegotiate, \ ssl3_renegotiate_check, \ dtls1_get_message, \ @@ -809,7 +831,7 @@ int ssl_undefined_function(SSL *s); int ssl_undefined_void_function(void); int ssl_undefined_const_function(const SSL *s); X509 *ssl_get_server_send_cert(SSL *); -EVP_PKEY *ssl_get_sign_pkey(SSL *,const SSL_CIPHER *); +EVP_PKEY *ssl_get_sign_pkey(SSL *s,const SSL_CIPHER *c, const EVP_MD **pmd); int ssl_cert_type(X509 *x,EVP_PKEY *pkey); void ssl_set_cert_masks(CERT *c, const SSL_CIPHER *cipher); STACK_OF(SSL_CIPHER) *ssl_get_ciphers_by_id(SSL *s); @@ -943,6 +965,7 @@ void dtls1_get_ccs_header(unsigned char *data, struct ccs_header_st *ccs_hdr); void dtls1_reset_seq_numbers(SSL *s, int rw); long dtls1_default_timeout(void); struct timeval* dtls1_get_timeout(SSL *s, struct timeval* timeleft); +int dtls1_check_timeout_num(SSL *s); int dtls1_handle_timeout(SSL *s); const SSL_CIPHER *dtls1_get_cipher(unsigned int u); void dtls1_start_timer(SSL *s); @@ -968,6 +991,9 @@ int ssl3_get_server_certificate(SSL *s); int ssl3_check_cert_and_algorithm(SSL *s); #ifndef OPENSSL_NO_TLSEXT int ssl3_check_finished(SSL *s); +# ifndef OPENSSL_NO_NEXTPROTONEG +int ssl3_send_next_proto(SSL *s); +# endif #endif int dtls1_client_hello(SSL *s); @@ -986,6 +1012,9 @@ int ssl3_check_client_hello(SSL *s); int ssl3_get_client_certificate(SSL *s); int ssl3_get_client_key_exchange(SSL *s); int ssl3_get_cert_verify(SSL *s); +#ifndef OPENSSL_NO_NEXTPROTONEG +int ssl3_get_next_proto(SSL *s); +#endif int dtls1_send_hello_request(SSL *s); int dtls1_send_server_hello(SSL *s); @@ -1013,6 +1042,7 @@ int dtls1_connect(SSL *s); void dtls1_free(SSL *s); void dtls1_clear(SSL *s); long dtls1_ctrl(SSL *s,int cmd, long larg, void *parg); +int dtls1_shutdown(SSL *s); long dtls1_get_message(SSL *s, int st1, int stn, int mt, long max, int *ok); int dtls1_get_record(SSL *s); @@ -1033,12 +1063,15 @@ int tls1_cert_verify_mac(SSL *s, int md_nid, unsigned char *p); int tls1_mac(SSL *ssl, unsigned char *md, int snd); int tls1_generate_master_secret(SSL *s, unsigned char *out, unsigned char *p, int len); +int tls1_export_keying_material(SSL *s, unsigned char *out, size_t olen, + const char *label, size_t llen, + const unsigned char *p, size_t plen, int use_context); int tls1_alert_code(int code); int ssl3_alert_code(int code); int ssl_ok(SSL *s); #ifndef OPENSSL_NO_ECDH -int ssl_check_srvr_ecc_cert_and_alg(X509 *x, const SSL_CIPHER *cs); +int ssl_check_srvr_ecc_cert_and_alg(X509 *x, SSL *s); #endif SSL_COMP *ssl3_comp_find(STACK_OF(SSL_COMP) *sk, int n); @@ -1058,6 +1091,13 @@ int ssl_prepare_serverhello_tlsext(SSL *s); int ssl_check_clienthello_tlsext(SSL *s); int ssl_check_serverhello_tlsext(SSL *s); +#ifndef OPENSSL_NO_HEARTBEATS +int tls1_heartbeat(SSL *s); +int dtls1_heartbeat(SSL *s); +int tls1_process_heartbeat(SSL *s); +int dtls1_process_heartbeat(SSL *s); +#endif + #ifdef OPENSSL_NO_SHA256 #define tlsext_tick_md EVP_sha1 #else @@ -1065,6 +1105,12 @@ int ssl_check_serverhello_tlsext(SSL *s); #endif int tls1_process_ticket(SSL *s, unsigned char *session_id, int len, const unsigned char *limit, SSL_SESSION **ret); + +int tls12_get_sigandhash(unsigned char *p, const EVP_PKEY *pk, + const EVP_MD *md); +int tls12_get_sigid(const EVP_PKEY *pk); +const EVP_MD *tls12_get_hash(unsigned char hash_alg); + #endif EVP_MD_CTX* ssl_replace_hash(EVP_MD_CTX **hash,const EVP_MD *md) ; void ssl_clear_hash_ctx(EVP_MD_CTX **hash); @@ -1076,4 +1122,13 @@ int ssl_add_clienthello_renegotiate_ext(SSL *s, unsigned char *p, int *len, int maxlen); int ssl_parse_clienthello_renegotiate_ext(SSL *s, unsigned char *d, int len, int *al); +long ssl_get_algorithm2(SSL *s); +int tls1_process_sigalgs(SSL *s, const unsigned char *data, int dsize); +int tls12_get_req_sig_algs(SSL *s, unsigned char *p); + +int ssl_add_clienthello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen); +int ssl_parse_clienthello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al); +int ssl_add_serverhello_use_srtp_ext(SSL *s, unsigned char *p, int *len, int maxlen); +int ssl_parse_serverhello_use_srtp_ext(SSL *s, unsigned char *d, int len,int *al); + #endif diff --git a/src/lib/libssl/ssl_sess.c b/src/lib/libssl/ssl_sess.c index 8e5d8a0972..ad40fadd02 100644 --- a/src/lib/libssl/ssl_sess.c +++ b/src/lib/libssl/ssl_sess.c @@ -217,6 +217,9 @@ SSL_SESSION *SSL_SESSION_new(void) #ifndef OPENSSL_NO_PSK ss->psk_identity_hint=NULL; ss->psk_identity=NULL; +#endif +#ifndef OPENSSL_NO_SRP + ss->srp_username=NULL; #endif return(ss); } @@ -228,6 +231,11 @@ const unsigned char *SSL_SESSION_get_id(const SSL_SESSION *s, unsigned int *len) return s->session_id; } +unsigned int SSL_SESSION_get_compress_id(const SSL_SESSION *s) + { + return s->compress_meth; + } + /* Even with SSLv2, we have 16 bytes (128 bits) of session ID space. SSLv3/TLSv1 * has 32 bytes (256 bits). As such, filling the ID with random gunk repeatedly * until we have no conflict is going to complete in one iteration pretty much @@ -300,6 +308,16 @@ int ssl_get_new_session(SSL *s, int session) ss->ssl_version=TLS1_VERSION; ss->session_id_length=SSL3_SSL_SESSION_ID_LENGTH; } + else if (s->version == TLS1_1_VERSION) + { + ss->ssl_version=TLS1_1_VERSION; + ss->session_id_length=SSL3_SSL_SESSION_ID_LENGTH; + } + else if (s->version == TLS1_2_VERSION) + { + ss->ssl_version=TLS1_2_VERSION; + ss->session_id_length=SSL3_SSL_SESSION_ID_LENGTH; + } else if (s->version == DTLS1_BAD_VER) { ss->ssl_version=DTLS1_BAD_VER; @@ -423,6 +441,25 @@ int ssl_get_new_session(SSL *s, int session) return(1); } +/* ssl_get_prev attempts to find an SSL_SESSION to be used to resume this + * connection. It is only called by servers. + * + * session_id: points at the session ID in the ClientHello. This code will + * read past the end of this in order to parse out the session ticket + * extension, if any. + * len: the length of the session ID. + * limit: a pointer to the first byte after the ClientHello. + * + * Returns: + * -1: error + * 0: a session may have been found. + * + * Side effects: + * - If a session is found then s->session is pointed at it (after freeing an + * existing session if need be) and s->verify_result is set from the session. + * - Both for new and resumed sessions, s->tlsext_ticket_expected is set to 1 + * if the server should issue a new session ticket (to 0 otherwise). + */ int ssl_get_prev_session(SSL *s, unsigned char *session_id, int len, const unsigned char *limit) { @@ -430,27 +467,39 @@ int ssl_get_prev_session(SSL *s, unsigned char *session_id, int len, SSL_SESSION *ret=NULL; int fatal = 0; + int try_session_cache = 1; #ifndef OPENSSL_NO_TLSEXT int r; #endif if (len > SSL_MAX_SSL_SESSION_ID_LENGTH) goto err; + + if (len == 0) + try_session_cache = 0; + #ifndef OPENSSL_NO_TLSEXT - r = tls1_process_ticket(s, session_id, len, limit, &ret); - if (r == -1) + r = tls1_process_ticket(s, session_id, len, limit, &ret); /* sets s->tlsext_ticket_expected */ + switch (r) { + case -1: /* Error during processing */ fatal = 1; goto err; + case 0: /* No ticket found */ + case 1: /* Zero length ticket found */ + break; /* Ok to carry on processing session id. */ + case 2: /* Ticket found but not decrypted. */ + case 3: /* Ticket decrypted, *ret has been set. */ + try_session_cache = 0; + break; + default: + abort(); } - else if (r == 0 || (!ret && !len)) - goto err; - else if (!ret && !(s->session_ctx->session_cache_mode & SSL_SESS_CACHE_NO_INTERNAL_LOOKUP)) -#else - if (len == 0) - goto err; - if (!(s->session_ctx->session_cache_mode & SSL_SESS_CACHE_NO_INTERNAL_LOOKUP)) #endif + + if (try_session_cache && + ret == NULL && + !(s->session_ctx->session_cache_mode & SSL_SESS_CACHE_NO_INTERNAL_LOOKUP)) { SSL_SESSION data; data.ssl_version=s->version; @@ -461,20 +510,22 @@ int ssl_get_prev_session(SSL *s, unsigned char *session_id, int len, CRYPTO_r_lock(CRYPTO_LOCK_SSL_CTX); ret=lh_SSL_SESSION_retrieve(s->session_ctx->sessions,&data); if (ret != NULL) - /* don't allow other threads to steal it: */ - CRYPTO_add(&ret->references,1,CRYPTO_LOCK_SSL_SESSION); + { + /* don't allow other threads to steal it: */ + CRYPTO_add(&ret->references,1,CRYPTO_LOCK_SSL_SESSION); + } CRYPTO_r_unlock(CRYPTO_LOCK_SSL_CTX); + if (ret == NULL) + s->session_ctx->stats.sess_miss++; } - if (ret == NULL) + if (try_session_cache && + ret == NULL && + s->session_ctx->get_session_cb != NULL) { int copy=1; - s->session_ctx->stats.sess_miss++; - ret=NULL; - if (s->session_ctx->get_session_cb != NULL - && (ret=s->session_ctx->get_session_cb(s,session_id,len,©)) - != NULL) + if ((ret=s->session_ctx->get_session_cb(s,session_id,len,©))) { s->session_ctx->stats.sess_cb_hit++; @@ -493,23 +544,18 @@ int ssl_get_prev_session(SSL *s, unsigned char *session_id, int len, * things are very strange */ SSL_CTX_add_session(s->session_ctx,ret); } - if (ret == NULL) - goto err; } - /* Now ret is non-NULL, and we own one of its reference counts. */ + if (ret == NULL) + goto err; + + /* Now ret is non-NULL and we own one of its reference counts. */ if (ret->sid_ctx_length != s->sid_ctx_length || memcmp(ret->sid_ctx,s->sid_ctx,ret->sid_ctx_length)) { - /* We've found the session named by the client, but we don't + /* We have the session requested by the client, but we don't * want to use it in this context. */ - -#if 0 /* The client cannot always know when a session is not appropriate, - * so we shouldn't generate an error message. */ - - SSLerr(SSL_F_SSL_GET_PREV_SESSION,SSL_R_ATTEMPT_TO_REUSE_SESSION_IN_DIFFERENT_CONTEXT); -#endif goto err; /* treat like cache miss */ } @@ -546,39 +592,38 @@ int ssl_get_prev_session(SSL *s, unsigned char *session_id, int len, goto err; } - -#if 0 /* This is way too late. */ - - /* If a thread got the session, then 'swaped', and another got - * it and then due to a time-out decided to 'OPENSSL_free' it we could - * be in trouble. So I'll increment it now, then double decrement - * later - am I speaking rubbish?. */ - CRYPTO_add(&ret->references,1,CRYPTO_LOCK_SSL_SESSION); -#endif - if (ret->timeout < (long)(time(NULL) - ret->time)) /* timeout */ { s->session_ctx->stats.sess_timeout++; - /* remove it from the cache */ - SSL_CTX_remove_session(s->session_ctx,ret); + if (try_session_cache) + { + /* session was from the cache, so remove it */ + SSL_CTX_remove_session(s->session_ctx,ret); + } goto err; } s->session_ctx->stats.sess_hit++; - /* ret->time=time(NULL); */ /* rezero timeout? */ - /* again, just leave the session - * if it is the same session, we have just incremented and - * then decremented the reference count :-) */ if (s->session != NULL) SSL_SESSION_free(s->session); s->session=ret; s->verify_result = s->session->verify_result; - return(1); + return 1; err: if (ret != NULL) + { SSL_SESSION_free(ret); +#ifndef OPENSSL_NO_TLSEXT + if (!try_session_cache) + { + /* The session was from a ticket, so we should + * issue a ticket for the new session */ + s->tlsext_ticket_expected = 1; + } +#endif + } if (fatal) return -1; else @@ -728,6 +773,10 @@ void SSL_SESSION_free(SSL_SESSION *ss) OPENSSL_free(ss->psk_identity_hint); if (ss->psk_identity != NULL) OPENSSL_free(ss->psk_identity); +#endif +#ifndef OPENSSL_NO_SRP + if (ss->srp_username != NULL) + OPENSSL_free(ss->srp_username); #endif OPENSSL_cleanse(ss,sizeof(*ss)); OPENSSL_free(ss); @@ -753,10 +802,6 @@ int SSL_set_session(SSL *s, SSL_SESSION *session) { if (!SSL_set_ssl_method(s,meth)) return(0); - if (s->ctx->session_timeout == 0) - session->timeout=SSL_get_default_timeout(s); - else - session->timeout=s->ctx->session_timeout; } #ifndef OPENSSL_NO_KRB5 @@ -824,6 +869,25 @@ long SSL_SESSION_set_time(SSL_SESSION *s, long t) return(t); } +X509 *SSL_SESSION_get0_peer(SSL_SESSION *s) + { + return s->peer; + } + +int SSL_SESSION_set1_id_context(SSL_SESSION *s,const unsigned char *sid_ctx, + unsigned int sid_ctx_len) + { + if(sid_ctx_len > SSL_MAX_SID_CTX_LENGTH) + { + SSLerr(SSL_F_SSL_SESSION_SET1_ID_CONTEXT,SSL_R_SSL_SESSION_ID_CONTEXT_TOO_LONG); + return 0; + } + s->sid_ctx_length=sid_ctx_len; + memcpy(s->sid_ctx,sid_ctx,sid_ctx_len); + + return 1; + } + long SSL_CTX_set_timeout(SSL_CTX *s, long t) { long l; diff --git a/src/lib/libssl/ssl_txt.c b/src/lib/libssl/ssl_txt.c index 3122440e26..6479d52c0c 100644 --- a/src/lib/libssl/ssl_txt.c +++ b/src/lib/libssl/ssl_txt.c @@ -115,6 +115,10 @@ int SSL_SESSION_print(BIO *bp, const SSL_SESSION *x) s="SSLv2"; else if (x->ssl_version == SSL3_VERSION) s="SSLv3"; + else if (x->ssl_version == TLS1_2_VERSION) + s="TLSv1.2"; + else if (x->ssl_version == TLS1_1_VERSION) + s="TLSv1.1"; else if (x->ssl_version == TLS1_VERSION) s="TLSv1"; else if (x->ssl_version == DTLS1_VERSION) @@ -187,6 +191,10 @@ int SSL_SESSION_print(BIO *bp, const SSL_SESSION *x) if (BIO_puts(bp,"\n PSK identity hint: ") <= 0) goto err; if (BIO_printf(bp, "%s", x->psk_identity_hint ? x->psk_identity_hint : "None") <= 0) goto err; #endif +#ifndef OPENSSL_NO_SRP + if (BIO_puts(bp,"\n SRP username: ") <= 0) goto err; + if (BIO_printf(bp, "%s", x->srp_username ? x->srp_username : "None") <= 0) goto err; +#endif #ifndef OPENSSL_NO_TLSEXT if (x->tlsext_tick_lifetime_hint) { diff --git a/src/lib/libssl/t1_clnt.c b/src/lib/libssl/t1_clnt.c index c87af17712..578617ed84 100644 --- a/src/lib/libssl/t1_clnt.c +++ b/src/lib/libssl/t1_clnt.c @@ -66,13 +66,26 @@ static const SSL_METHOD *tls1_get_client_method(int ver); static const SSL_METHOD *tls1_get_client_method(int ver) { + if (ver == TLS1_2_VERSION) + return TLSv1_2_client_method(); + if (ver == TLS1_1_VERSION) + return TLSv1_1_client_method(); if (ver == TLS1_VERSION) - return(TLSv1_client_method()); - else - return(NULL); + return TLSv1_client_method(); + return NULL; } -IMPLEMENT_tls1_meth_func(TLSv1_client_method, +IMPLEMENT_tls_meth_func(TLS1_2_VERSION, TLSv1_2_client_method, + ssl_undefined_function, + ssl3_connect, + tls1_get_client_method) + +IMPLEMENT_tls_meth_func(TLS1_1_VERSION, TLSv1_1_client_method, + ssl_undefined_function, + ssl3_connect, + tls1_get_client_method) + +IMPLEMENT_tls_meth_func(TLS1_VERSION, TLSv1_client_method, ssl_undefined_function, ssl3_connect, tls1_get_client_method) diff --git a/src/lib/libssl/t1_enc.c b/src/lib/libssl/t1_enc.c index 793ea43e90..f7bdeb3b9d 100644 --- a/src/lib/libssl/t1_enc.c +++ b/src/lib/libssl/t1_enc.c @@ -143,6 +143,7 @@ #include #include #include +#include #ifdef KSSL_DEBUG #include #endif @@ -158,68 +159,75 @@ static int tls1_P_hash(const EVP_MD *md, const unsigned char *sec, unsigned char *out, int olen) { int chunk; - unsigned int j; - HMAC_CTX ctx; - HMAC_CTX ctx_tmp; + size_t j; + EVP_MD_CTX ctx, ctx_tmp; + EVP_PKEY *mac_key; unsigned char A1[EVP_MAX_MD_SIZE]; - unsigned int A1_len; + size_t A1_len; int ret = 0; chunk=EVP_MD_size(md); OPENSSL_assert(chunk >= 0); - HMAC_CTX_init(&ctx); - HMAC_CTX_init(&ctx_tmp); - if (!HMAC_Init_ex(&ctx,sec,sec_len,md, NULL)) + EVP_MD_CTX_init(&ctx); + EVP_MD_CTX_init(&ctx_tmp); + EVP_MD_CTX_set_flags(&ctx, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + EVP_MD_CTX_set_flags(&ctx_tmp, EVP_MD_CTX_FLAG_NON_FIPS_ALLOW); + mac_key = EVP_PKEY_new_mac_key(EVP_PKEY_HMAC, NULL, sec, sec_len); + if (!mac_key) + goto err; + if (!EVP_DigestSignInit(&ctx,NULL,md, NULL, mac_key)) goto err; - if (!HMAC_Init_ex(&ctx_tmp,sec,sec_len,md, NULL)) + if (!EVP_DigestSignInit(&ctx_tmp,NULL,md, NULL, mac_key)) goto err; - if (seed1 != NULL && !HMAC_Update(&ctx,seed1,seed1_len)) + if (seed1 && !EVP_DigestSignUpdate(&ctx,seed1,seed1_len)) goto err; - if (seed2 != NULL && !HMAC_Update(&ctx,seed2,seed2_len)) + if (seed2 && !EVP_DigestSignUpdate(&ctx,seed2,seed2_len)) goto err; - if (seed3 != NULL && !HMAC_Update(&ctx,seed3,seed3_len)) + if (seed3 && !EVP_DigestSignUpdate(&ctx,seed3,seed3_len)) goto err; - if (seed4 != NULL && !HMAC_Update(&ctx,seed4,seed4_len)) + if (seed4 && !EVP_DigestSignUpdate(&ctx,seed4,seed4_len)) goto err; - if (seed5 != NULL && !HMAC_Update(&ctx,seed5,seed5_len)) + if (seed5 && !EVP_DigestSignUpdate(&ctx,seed5,seed5_len)) goto err; - if (!HMAC_Final(&ctx,A1,&A1_len)) + if (!EVP_DigestSignFinal(&ctx,A1,&A1_len)) goto err; for (;;) { - if (!HMAC_Init_ex(&ctx,NULL,0,NULL,NULL)) /* re-init */ + /* Reinit mac contexts */ + if (!EVP_DigestSignInit(&ctx,NULL,md, NULL, mac_key)) goto err; - if (!HMAC_Init_ex(&ctx_tmp,NULL,0,NULL,NULL)) /* re-init */ + if (!EVP_DigestSignInit(&ctx_tmp,NULL,md, NULL, mac_key)) goto err; - if (!HMAC_Update(&ctx,A1,A1_len)) + if (!EVP_DigestSignUpdate(&ctx,A1,A1_len)) goto err; - if (!HMAC_Update(&ctx_tmp,A1,A1_len)) + if (!EVP_DigestSignUpdate(&ctx_tmp,A1,A1_len)) goto err; - if (seed1 != NULL && !HMAC_Update(&ctx,seed1,seed1_len)) + if (seed1 && !EVP_DigestSignUpdate(&ctx,seed1,seed1_len)) goto err; - if (seed2 != NULL && !HMAC_Update(&ctx,seed2,seed2_len)) + if (seed2 && !EVP_DigestSignUpdate(&ctx,seed2,seed2_len)) goto err; - if (seed3 != NULL && !HMAC_Update(&ctx,seed3,seed3_len)) + if (seed3 && !EVP_DigestSignUpdate(&ctx,seed3,seed3_len)) goto err; - if (seed4 != NULL && !HMAC_Update(&ctx,seed4,seed4_len)) + if (seed4 && !EVP_DigestSignUpdate(&ctx,seed4,seed4_len)) goto err; - if (seed5 != NULL && !HMAC_Update(&ctx,seed5,seed5_len)) + if (seed5 && !EVP_DigestSignUpdate(&ctx,seed5,seed5_len)) goto err; if (olen > chunk) { - if (!HMAC_Final(&ctx,out,&j)) + if (!EVP_DigestSignFinal(&ctx,out,&j)) goto err; out+=j; olen-=j; - if (!HMAC_Final(&ctx_tmp,A1,&A1_len)) /* calc the next A1 value */ + /* calc the next A1 value */ + if (!EVP_DigestSignFinal(&ctx_tmp,A1,&A1_len)) goto err; } else /* last one */ { - if (!HMAC_Final(&ctx,A1,&A1_len)) + if (!EVP_DigestSignFinal(&ctx,A1,&A1_len)) goto err; memcpy(out,A1,olen); break; @@ -227,8 +235,9 @@ static int tls1_P_hash(const EVP_MD *md, const unsigned char *sec, } ret = 1; err: - HMAC_CTX_cleanup(&ctx); - HMAC_CTX_cleanup(&ctx_tmp); + EVP_PKEY_free(mac_key); + EVP_MD_CTX_cleanup(&ctx); + EVP_MD_CTX_cleanup(&ctx_tmp); OPENSSL_cleanse(A1,sizeof(A1)); return ret; } @@ -256,6 +265,8 @@ static int tls1_PRF(long digest_mask, if ((m<s3->tmp.new_cipher->algorithm2, + ret = tls1_PRF(ssl_get_algorithm2(s), TLS_MD_KEY_EXPANSION_CONST,TLS_MD_KEY_EXPANSION_CONST_SIZE, s->s3->server_random,SSL3_RANDOM_SIZE, s->s3->client_random,SSL3_RANDOM_SIZE, @@ -358,7 +369,7 @@ int tls1_change_cipher_state(SSL *s, int which) { if (s->s3->tmp.new_cipher->algorithm2 & TLS1_STREAM_MAC) s->mac_flags |= SSL_MAC_FLAG_READ_MAC_STREAM; - else + else s->mac_flags &= ~SSL_MAC_FLAG_READ_MAC_STREAM; if (s->enc_read_ctx != NULL) @@ -445,7 +456,11 @@ int tls1_change_cipher_state(SSL *s, int which) j=is_export ? (cl < SSL_C_EXPORT_KEYLENGTH(s->s3->tmp.new_cipher) ? cl : SSL_C_EXPORT_KEYLENGTH(s->s3->tmp.new_cipher)) : cl; /* Was j=(exp)?5:EVP_CIPHER_key_length(c); */ - k=EVP_CIPHER_iv_length(c); + /* If GCM mode only part of IV comes from PRF */ + if (EVP_CIPHER_mode(c) == EVP_CIPH_GCM_MODE) + k = EVP_GCM_TLS_FIXED_IV_LEN; + else + k=EVP_CIPHER_iv_length(c); if ( (which == SSL3_CHANGE_CIPHER_CLIENT_WRITE) || (which == SSL3_CHANGE_CIPHER_SERVER_READ)) { @@ -474,10 +489,14 @@ int tls1_change_cipher_state(SSL *s, int which) } memcpy(mac_secret,ms,i); - mac_key = EVP_PKEY_new_mac_key(mac_type, NULL, - mac_secret,*mac_secret_size); - EVP_DigestSignInit(mac_ctx,NULL,m,NULL,mac_key); - EVP_PKEY_free(mac_key); + + if (!(EVP_CIPHER_flags(c)&EVP_CIPH_FLAG_AEAD_CIPHER)) + { + mac_key = EVP_PKEY_new_mac_key(mac_type, NULL, + mac_secret,*mac_secret_size); + EVP_DigestSignInit(mac_ctx,NULL,m,NULL,mac_key); + EVP_PKEY_free(mac_key); + } #ifdef TLS_DEBUG printf("which = %04X\nmac key=",which); { int z; for (z=0; zoptions & SSL_OP_DONT_INSERT_EMPTY_FRAGMENTS)) + if (!(s->options & SSL_OP_DONT_INSERT_EMPTY_FRAGMENTS) + && s->method->version <= TLS1_VERSION) { /* enable vulnerability countermeasure for CBC ciphers with * known-IV problem (http://www.openssl.org/~bodo/tls-cbc.txt) @@ -640,14 +672,14 @@ int tls1_enc(SSL *s, int send) SSL3_RECORD *rec; EVP_CIPHER_CTX *ds; unsigned long l; - int bs,i,ii,j,k,n=0; + int bs,i,ii,j,k,pad=0; const EVP_CIPHER *enc; if (send) { if (EVP_MD_CTX_md(s->write_hash)) { - n=EVP_MD_CTX_size(s->write_hash); + int n=EVP_MD_CTX_size(s->write_hash); OPENSSL_assert(n >= 0); } ds=s->enc_write_ctx; @@ -655,13 +687,34 @@ int tls1_enc(SSL *s, int send) if (s->enc_write_ctx == NULL) enc=NULL; else + { + int ivlen; enc=EVP_CIPHER_CTX_cipher(s->enc_write_ctx); + /* For TLSv1.1 and later explicit IV */ + if (s->version >= TLS1_1_VERSION + && EVP_CIPHER_mode(enc) == EVP_CIPH_CBC_MODE) + ivlen = EVP_CIPHER_iv_length(enc); + else + ivlen = 0; + if (ivlen > 1) + { + if ( rec->data != rec->input) + /* we can't write into the input stream: + * Can this ever happen?? (steve) + */ + fprintf(stderr, + "%s:%d: rec->data != rec->input\n", + __FILE__, __LINE__); + else if (RAND_bytes(rec->input, ivlen) <= 0) + return -1; + } + } } else { if (EVP_MD_CTX_md(s->read_hash)) { - n=EVP_MD_CTX_size(s->read_hash); + int n=EVP_MD_CTX_size(s->read_hash); OPENSSL_assert(n >= 0); } ds=s->enc_read_ctx; @@ -687,7 +740,43 @@ int tls1_enc(SSL *s, int send) l=rec->length; bs=EVP_CIPHER_block_size(ds->cipher); - if ((bs != 1) && send) + if (EVP_CIPHER_flags(ds->cipher)&EVP_CIPH_FLAG_AEAD_CIPHER) + { + unsigned char buf[13],*seq; + + seq = send?s->s3->write_sequence:s->s3->read_sequence; + + if (s->version == DTLS1_VERSION || s->version == DTLS1_BAD_VER) + { + unsigned char dtlsseq[9],*p=dtlsseq; + + s2n(send?s->d1->w_epoch:s->d1->r_epoch,p); + memcpy(p,&seq[2],6); + memcpy(buf,dtlsseq,8); + } + else + { + memcpy(buf,seq,8); + for (i=7; i>=0; i--) /* increment */ + { + ++seq[i]; + if (seq[i] != 0) break; + } + } + + buf[8]=rec->type; + buf[9]=(unsigned char)(s->version>>8); + buf[10]=(unsigned char)(s->version); + buf[11]=rec->length>>8; + buf[12]=rec->length&0xff; + pad=EVP_CIPHER_CTX_ctrl(ds,EVP_CTRL_AEAD_TLS1_AAD,13,buf); + if (send) + { + l+=pad; + rec->length+=pad; + } + } + else if ((bs != 1) && send) { i=bs-((int)l%bs); @@ -728,13 +817,25 @@ int tls1_enc(SSL *s, int send) { if (l == 0 || l%bs != 0) { + if (s->version >= TLS1_1_VERSION) + return -1; SSLerr(SSL_F_TLS1_ENC,SSL_R_BLOCK_CIPHER_PAD_IS_WRONG); ssl3_send_alert(s,SSL3_AL_FATAL,SSL_AD_DECRYPTION_FAILED); return 0; } } - EVP_Cipher(ds,rec->data,rec->input,l); + i = EVP_Cipher(ds,rec->data,rec->input,l); + if ((EVP_CIPHER_flags(ds->cipher)&EVP_CIPH_FLAG_CUSTOM_CIPHER) + ?(i<0) + :(i==0)) + return -1; /* AEAD can fail to verify MAC */ + if (EVP_CIPHER_mode(enc) == EVP_CIPH_GCM_MODE && !send) + { + rec->data += EVP_GCM_TLS_EXPLICIT_IV_LEN; + rec->input += EVP_GCM_TLS_EXPLICIT_IV_LEN; + rec->length -= EVP_GCM_TLS_EXPLICIT_IV_LEN; + } #ifdef KSSL_DEBUG { @@ -784,8 +885,19 @@ int tls1_enc(SSL *s, int send) return -1; } } - rec->length-=i; + rec->length -=i; + if (s->version >= TLS1_1_VERSION + && EVP_CIPHER_CTX_mode(ds) == EVP_CIPH_CBC_MODE) + { + if (bs > (int)rec->length) + return -1; + rec->data += bs; /* skip the explicit IV */ + rec->input += bs; + rec->length -= bs; + } } + if (pad && !send) + rec->length -= pad; } return(1); } @@ -841,7 +953,7 @@ int tls1_final_finish_mac(SSL *s, for (idx=0;ssl_get_handshake_digest(idx,&mask,&md);idx++) { - if (mask & s->s3->tmp.new_cipher->algorithm2) + if (mask & ssl_get_algorithm2(s)) { int hashsize = EVP_MD_size(md); if (hashsize < 0 || hashsize > (int)(sizeof buf - (size_t)(q-buf))) @@ -860,7 +972,7 @@ int tls1_final_finish_mac(SSL *s, } } - if (!tls1_PRF(s->s3->tmp.new_cipher->algorithm2, + if (!tls1_PRF(ssl_get_algorithm2(s), str,slen, buf,(int)(q-buf), NULL,0, NULL,0, NULL,0, s->session->master_key,s->session->master_key_length, out,buf2,sizeof buf2)) @@ -970,6 +1082,7 @@ int tls1_generate_master_secret(SSL *s, unsigned char *out, unsigned char *p, const void *co = NULL, *so = NULL; int col = 0, sol = 0; + #ifdef KSSL_DEBUG printf ("tls1_generate_master_secret(%p,%p, %p, %d)\n", s,out, p,len); #endif /* KSSL_DEBUG */ @@ -986,7 +1099,7 @@ int tls1_generate_master_secret(SSL *s, unsigned char *out, unsigned char *p, } #endif - tls1_PRF(s->s3->tmp.new_cipher->algorithm2, + tls1_PRF(ssl_get_algorithm2(s), TLS_MD_MASTER_SECRET_CONST,TLS_MD_MASTER_SECRET_CONST_SIZE, s->s3->client_random,SSL3_RANDOM_SIZE, co, col, @@ -994,6 +1107,16 @@ int tls1_generate_master_secret(SSL *s, unsigned char *out, unsigned char *p, so, sol, p,len, s->session->master_key,buff,sizeof buff); +#ifdef SSL_DEBUG + fprintf(stderr, "Premaster Secret:\n"); + BIO_dump_fp(stderr, (char *)p, len); + fprintf(stderr, "Client Random:\n"); + BIO_dump_fp(stderr, (char *)s->s3->client_random, SSL3_RANDOM_SIZE); + fprintf(stderr, "Server Random:\n"); + BIO_dump_fp(stderr, (char *)s->s3->server_random, SSL3_RANDOM_SIZE); + fprintf(stderr, "Master Secret:\n"); + BIO_dump_fp(stderr, (char *)s->session->master_key, SSL3_MASTER_SECRET_SIZE); +#endif #ifdef KSSL_DEBUG printf ("tls1_generate_master_secret() complete\n"); @@ -1001,6 +1124,95 @@ int tls1_generate_master_secret(SSL *s, unsigned char *out, unsigned char *p, return(SSL3_MASTER_SECRET_SIZE); } +int tls1_export_keying_material(SSL *s, unsigned char *out, size_t olen, + const char *label, size_t llen, const unsigned char *context, + size_t contextlen, int use_context) + { + unsigned char *buff; + unsigned char *val = NULL; + size_t vallen, currentvalpos; + int rv; + +#ifdef KSSL_DEBUG + printf ("tls1_export_keying_material(%p,%p,%d,%s,%d,%p,%d)\n", s, out, olen, label, llen, p, plen); +#endif /* KSSL_DEBUG */ + + buff = OPENSSL_malloc(olen); + if (buff == NULL) goto err2; + + /* construct PRF arguments + * we construct the PRF argument ourself rather than passing separate + * values into the TLS PRF to ensure that the concatenation of values + * does not create a prohibited label. + */ + vallen = llen + SSL3_RANDOM_SIZE * 2; + if (use_context) + { + vallen += 2 + contextlen; + } + + val = OPENSSL_malloc(vallen); + if (val == NULL) goto err2; + currentvalpos = 0; + memcpy(val + currentvalpos, (unsigned char *) label, llen); + currentvalpos += llen; + memcpy(val + currentvalpos, s->s3->client_random, SSL3_RANDOM_SIZE); + currentvalpos += SSL3_RANDOM_SIZE; + memcpy(val + currentvalpos, s->s3->server_random, SSL3_RANDOM_SIZE); + currentvalpos += SSL3_RANDOM_SIZE; + + if (use_context) + { + val[currentvalpos] = (contextlen >> 8) & 0xff; + currentvalpos++; + val[currentvalpos] = contextlen & 0xff; + currentvalpos++; + if ((contextlen > 0) || (context != NULL)) + { + memcpy(val + currentvalpos, context, contextlen); + } + } + + /* disallow prohibited labels + * note that SSL3_RANDOM_SIZE > max(prohibited label len) = + * 15, so size of val > max(prohibited label len) = 15 and the + * comparisons won't have buffer overflow + */ + if (memcmp(val, TLS_MD_CLIENT_FINISH_CONST, + TLS_MD_CLIENT_FINISH_CONST_SIZE) == 0) goto err1; + if (memcmp(val, TLS_MD_SERVER_FINISH_CONST, + TLS_MD_SERVER_FINISH_CONST_SIZE) == 0) goto err1; + if (memcmp(val, TLS_MD_MASTER_SECRET_CONST, + TLS_MD_MASTER_SECRET_CONST_SIZE) == 0) goto err1; + if (memcmp(val, TLS_MD_KEY_EXPANSION_CONST, + TLS_MD_KEY_EXPANSION_CONST_SIZE) == 0) goto err1; + + rv = tls1_PRF(s->s3->tmp.new_cipher->algorithm2, + val, vallen, + NULL, 0, + NULL, 0, + NULL, 0, + NULL, 0, + s->session->master_key,s->session->master_key_length, + out,buff,olen); + +#ifdef KSSL_DEBUG + printf ("tls1_export_keying_material() complete\n"); +#endif /* KSSL_DEBUG */ + goto ret; +err1: + SSLerr(SSL_F_TLS1_EXPORT_KEYING_MATERIAL, SSL_R_TLS_ILLEGAL_EXPORTER_LABEL); + rv = 0; + goto ret; +err2: + SSLerr(SSL_F_TLS1_EXPORT_KEYING_MATERIAL, ERR_R_MALLOC_FAILURE); + rv = 0; +ret: + if (buff != NULL) OPENSSL_free(buff); + if (val != NULL) OPENSSL_free(val); + return(rv); + } + int tls1_alert_code(int code) { switch (code) @@ -1042,4 +1254,3 @@ int tls1_alert_code(int code) default: return(-1); } } - diff --git a/src/lib/libssl/t1_lib.c b/src/lib/libssl/t1_lib.c index 26cbae449e..27c8e3460d 100644 --- a/src/lib/libssl/t1_lib.c +++ b/src/lib/libssl/t1_lib.c @@ -114,6 +114,7 @@ #include #include #include +#include #include "ssl_locl.h" const char tls1_version_str[]="TLSv1" OPENSSL_VERSION_PTEXT; @@ -136,6 +137,7 @@ SSL3_ENC_METHOD TLSv1_enc_data={ TLS_MD_CLIENT_FINISH_CONST,TLS_MD_CLIENT_FINISH_CONST_SIZE, TLS_MD_SERVER_FINISH_CONST,TLS_MD_SERVER_FINISH_CONST_SIZE, tls1_alert_code, + tls1_export_keying_material, }; long tls1_default_timeout(void) @@ -166,10 +168,11 @@ void tls1_free(SSL *s) void tls1_clear(SSL *s) { ssl3_clear(s); - s->version=TLS1_VERSION; + s->version = s->method->version; } #ifndef OPENSSL_NO_EC + static int nid_list[] = { NID_sect163k1, /* sect163k1 (1) */ @@ -198,7 +201,36 @@ static int nid_list[] = NID_secp384r1, /* secp384r1 (24) */ NID_secp521r1 /* secp521r1 (25) */ }; - + +static int pref_list[] = + { + NID_sect571r1, /* sect571r1 (14) */ + NID_sect571k1, /* sect571k1 (13) */ + NID_secp521r1, /* secp521r1 (25) */ + NID_sect409k1, /* sect409k1 (11) */ + NID_sect409r1, /* sect409r1 (12) */ + NID_secp384r1, /* secp384r1 (24) */ + NID_sect283k1, /* sect283k1 (9) */ + NID_sect283r1, /* sect283r1 (10) */ + NID_secp256k1, /* secp256k1 (22) */ + NID_X9_62_prime256v1, /* secp256r1 (23) */ + NID_sect239k1, /* sect239k1 (8) */ + NID_sect233k1, /* sect233k1 (6) */ + NID_sect233r1, /* sect233r1 (7) */ + NID_secp224k1, /* secp224k1 (20) */ + NID_secp224r1, /* secp224r1 (21) */ + NID_sect193r1, /* sect193r1 (4) */ + NID_sect193r2, /* sect193r2 (5) */ + NID_secp192k1, /* secp192k1 (18) */ + NID_X9_62_prime192v1, /* secp192r1 (19) */ + NID_sect163k1, /* sect163k1 (1) */ + NID_sect163r1, /* sect163r1 (2) */ + NID_sect163r2, /* sect163r2 (3) */ + NID_secp160k1, /* secp160k1 (15) */ + NID_secp160r1, /* secp160r1 (16) */ + NID_secp160r2, /* secp160r2 (17) */ + }; + int tls1_ec_curve_id2nid(int curve_id) { /* ECC curves from draft-ietf-tls-ecc-12.txt (Oct. 17, 2005) */ @@ -270,6 +302,64 @@ int tls1_ec_nid2curve_id(int nid) #endif /* OPENSSL_NO_EC */ #ifndef OPENSSL_NO_TLSEXT + +/* List of supported signature algorithms and hashes. Should make this + * customisable at some point, for now include everything we support. + */ + +#ifdef OPENSSL_NO_RSA +#define tlsext_sigalg_rsa(md) /* */ +#else +#define tlsext_sigalg_rsa(md) md, TLSEXT_signature_rsa, +#endif + +#ifdef OPENSSL_NO_DSA +#define tlsext_sigalg_dsa(md) /* */ +#else +#define tlsext_sigalg_dsa(md) md, TLSEXT_signature_dsa, +#endif + +#ifdef OPENSSL_NO_ECDSA +#define tlsext_sigalg_ecdsa(md) /* */ +#else +#define tlsext_sigalg_ecdsa(md) md, TLSEXT_signature_ecdsa, +#endif + +#define tlsext_sigalg(md) \ + tlsext_sigalg_rsa(md) \ + tlsext_sigalg_dsa(md) \ + tlsext_sigalg_ecdsa(md) + +static unsigned char tls12_sigalgs[] = { +#ifndef OPENSSL_NO_SHA512 + tlsext_sigalg(TLSEXT_hash_sha512) + tlsext_sigalg(TLSEXT_hash_sha384) +#endif +#ifndef OPENSSL_NO_SHA256 + tlsext_sigalg(TLSEXT_hash_sha256) + tlsext_sigalg(TLSEXT_hash_sha224) +#endif +#ifndef OPENSSL_NO_SHA + tlsext_sigalg(TLSEXT_hash_sha1) +#endif +#ifndef OPENSSL_NO_MD5 + tlsext_sigalg_rsa(TLSEXT_hash_md5) +#endif +}; + +int tls12_get_req_sig_algs(SSL *s, unsigned char *p) + { + size_t slen = sizeof(tls12_sigalgs); +#ifdef OPENSSL_FIPS + /* If FIPS mode don't include MD5 which is last */ + if (FIPS_mode()) + slen -= 2; +#endif + if (p) + memcpy(p, tls12_sigalgs, slen); + return (int)slen; + } + unsigned char *ssl_add_clienthello_tlsext(SSL *s, unsigned char *p, unsigned char *limit) { int extdatalen=0; @@ -317,7 +407,7 @@ unsigned char *ssl_add_clienthello_tlsext(SSL *s, unsigned char *p, unsigned cha } /* Add RI if renegotiating */ - if (s->new_session) + if (s->renegotiate) { int el; @@ -341,6 +431,34 @@ unsigned char *ssl_add_clienthello_tlsext(SSL *s, unsigned char *p, unsigned cha ret += el; } +#ifndef OPENSSL_NO_SRP + /* Add SRP username if there is one */ + if (s->srp_ctx.login != NULL) + { /* Add TLS extension SRP username to the Client Hello message */ + + int login_len = strlen(s->srp_ctx.login); + if (login_len > 255 || login_len == 0) + { + SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_TLSEXT, ERR_R_INTERNAL_ERROR); + return NULL; + } + + /* check for enough space. + 4 for the srp type type and entension length + 1 for the srp user identity + + srp user identity length + */ + if ((limit - ret - 5 - login_len) < 0) return NULL; + + /* fill in the extension */ + s2n(TLSEXT_TYPE_srp,ret); + s2n(login_len+1,ret); + (*ret++) = (unsigned char) login_len; + memcpy(ret, s->srp_ctx.login, login_len); + ret+=login_len; + } +#endif + #ifndef OPENSSL_NO_EC if (s->tlsext_ecpointformatlist != NULL && s->version != DTLS1_VERSION) @@ -426,6 +544,17 @@ unsigned char *ssl_add_clienthello_tlsext(SSL *s, unsigned char *p, unsigned cha } skip_ext: + if (TLS1_get_client_version(s) >= TLS1_2_VERSION) + { + if ((size_t)(limit - ret) < sizeof(tls12_sigalgs) + 6) + return NULL; + s2n(TLSEXT_TYPE_signature_algorithms,ret); + s2n(sizeof(tls12_sigalgs) + 2, ret); + s2n(sizeof(tls12_sigalgs), ret); + memcpy(ret, tls12_sigalgs, sizeof(tls12_sigalgs)); + ret += sizeof(tls12_sigalgs); + } + #ifdef TLSEXT_TYPE_opaque_prf_input if (s->s3->client_opaque_prf_input != NULL && s->version != DTLS1_VERSION) @@ -494,6 +623,51 @@ unsigned char *ssl_add_clienthello_tlsext(SSL *s, unsigned char *p, unsigned cha i2d_X509_EXTENSIONS(s->tlsext_ocsp_exts, &ret); } +#ifndef OPENSSL_NO_HEARTBEATS + /* Add Heartbeat extension */ + s2n(TLSEXT_TYPE_heartbeat,ret); + s2n(1,ret); + /* Set mode: + * 1: peer may send requests + * 2: peer not allowed to send requests + */ + if (s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_RECV_REQUESTS) + *(ret++) = SSL_TLSEXT_HB_DONT_SEND_REQUESTS; + else + *(ret++) = SSL_TLSEXT_HB_ENABLED; +#endif + +#ifndef OPENSSL_NO_NEXTPROTONEG + if (s->ctx->next_proto_select_cb && !s->s3->tmp.finish_md_len) + { + /* The client advertises an emtpy extension to indicate its + * support for Next Protocol Negotiation */ + if (limit - ret - 4 < 0) + return NULL; + s2n(TLSEXT_TYPE_next_proto_neg,ret); + s2n(0,ret); + } +#endif + + if(SSL_get_srtp_profiles(s)) + { + int el; + + ssl_add_clienthello_use_srtp_ext(s, 0, &el, 0); + + if((limit - p - 4 - el) < 0) return NULL; + + s2n(TLSEXT_TYPE_use_srtp,ret); + s2n(el,ret); + + if(ssl_add_clienthello_use_srtp_ext(s, ret, &el, el)) + { + SSLerr(SSL_F_SSL_ADD_CLIENTHELLO_TLSEXT, ERR_R_INTERNAL_ERROR); + return NULL; + } + ret += el; + } + if ((extdatalen = ret-p-2)== 0) return p; @@ -505,6 +679,9 @@ unsigned char *ssl_add_serverhello_tlsext(SSL *s, unsigned char *p, unsigned cha { int extdatalen=0; unsigned char *ret = p; +#ifndef OPENSSL_NO_NEXTPROTONEG + int next_proto_neg_seen; +#endif /* don't add extensions for SSLv3, unless doing secure renegotiation */ if (s->version == SSL3_VERSION && !s->s3->send_connection_binding) @@ -603,6 +780,26 @@ unsigned char *ssl_add_serverhello_tlsext(SSL *s, unsigned char *p, unsigned cha ret += sol; } #endif + + if(s->srtp_profile) + { + int el; + + ssl_add_serverhello_use_srtp_ext(s, 0, &el, 0); + + if((limit - p - 4 - el) < 0) return NULL; + + s2n(TLSEXT_TYPE_use_srtp,ret); + s2n(el,ret); + + if(ssl_add_serverhello_use_srtp_ext(s, ret, &el, el)) + { + SSLerr(SSL_F_SSL_ADD_SERVERHELLO_TLSEXT, ERR_R_INTERNAL_ERROR); + return NULL; + } + ret+=el; + } + if (((s->s3->tmp.new_cipher->id & 0xFFFF)==0x80 || (s->s3->tmp.new_cipher->id & 0xFFFF)==0x81) && (SSL_get_options(s) & SSL_OP_CRYPTOPRO_TLSEXT_BUG)) { const unsigned char cryptopro_ext[36] = { @@ -618,6 +815,46 @@ unsigned char *ssl_add_serverhello_tlsext(SSL *s, unsigned char *p, unsigned cha } +#ifndef OPENSSL_NO_HEARTBEATS + /* Add Heartbeat extension if we've received one */ + if (s->tlsext_heartbeat & SSL_TLSEXT_HB_ENABLED) + { + s2n(TLSEXT_TYPE_heartbeat,ret); + s2n(1,ret); + /* Set mode: + * 1: peer may send requests + * 2: peer not allowed to send requests + */ + if (s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_RECV_REQUESTS) + *(ret++) = SSL_TLSEXT_HB_DONT_SEND_REQUESTS; + else + *(ret++) = SSL_TLSEXT_HB_ENABLED; + + } +#endif + +#ifndef OPENSSL_NO_NEXTPROTONEG + next_proto_neg_seen = s->s3->next_proto_neg_seen; + s->s3->next_proto_neg_seen = 0; + if (next_proto_neg_seen && s->ctx->next_protos_advertised_cb) + { + const unsigned char *npa; + unsigned int npalen; + int r; + + r = s->ctx->next_protos_advertised_cb(s, &npa, &npalen, s->ctx->next_protos_advertised_cb_arg); + if (r == SSL_TLSEXT_ERR_OK) + { + if ((long)(limit - ret - 4 - npalen) < 0) return NULL; + s2n(TLSEXT_TYPE_next_proto_neg,ret); + s2n(npalen,ret); + memcpy(ret, npa, npalen); + ret += npalen; + s->s3->next_proto_neg_seen = 1; + } + } +#endif + if ((extdatalen = ret-p-2)== 0) return p; @@ -632,9 +869,18 @@ int ssl_parse_clienthello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in unsigned short len; unsigned char *data = *p; int renegotiate_seen = 0; + int sigalg_seen = 0; s->servername_done = 0; s->tlsext_status_type = -1; +#ifndef OPENSSL_NO_NEXTPROTONEG + s->s3->next_proto_neg_seen = 0; +#endif + +#ifndef OPENSSL_NO_HEARTBEATS + s->tlsext_heartbeat &= ~(SSL_TLSEXT_HB_ENABLED | + SSL_TLSEXT_HB_DONT_SEND_REQUESTS); +#endif if (data >= (d+n-2)) goto ri_check; @@ -762,6 +1008,31 @@ int ssl_parse_clienthello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in } } +#ifndef OPENSSL_NO_SRP + else if (type == TLSEXT_TYPE_srp) + { + if (size <= 0 || ((len = data[0])) != (size -1)) + { + *al = SSL_AD_DECODE_ERROR; + return 0; + } + if (s->srp_ctx.login != NULL) + { + *al = SSL_AD_DECODE_ERROR; + return 0; + } + if ((s->srp_ctx.login = OPENSSL_malloc(len+1)) == NULL) + return -1; + memcpy(s->srp_ctx.login, &data[1], len); + s->srp_ctx.login[len]='\0'; + + if (strlen(s->srp_ctx.login) != len) + { + *al = SSL_AD_DECODE_ERROR; + return 0; + } + } +#endif #ifndef OPENSSL_NO_EC else if (type == TLSEXT_TYPE_ec_point_formats && @@ -882,6 +1153,28 @@ int ssl_parse_clienthello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in return 0; renegotiate_seen = 1; } + else if (type == TLSEXT_TYPE_signature_algorithms) + { + int dsize; + if (sigalg_seen || size < 2) + { + *al = SSL_AD_DECODE_ERROR; + return 0; + } + sigalg_seen = 1; + n2s(data,dsize); + size -= 2; + if (dsize != size || dsize & 1) + { + *al = SSL_AD_DECODE_ERROR; + return 0; + } + if (!tls1_process_sigalgs(s, data, dsize)) + { + *al = SSL_AD_DECODE_ERROR; + return 0; + } + } else if (type == TLSEXT_TYPE_status_request && s->version != DTLS1_VERSION && s->ctx->tlsext_status_cb) { @@ -994,8 +1287,54 @@ int ssl_parse_clienthello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in else s->tlsext_status_type = -1; } +#ifndef OPENSSL_NO_HEARTBEATS + else if (type == TLSEXT_TYPE_heartbeat) + { + switch(data[0]) + { + case 0x01: /* Client allows us to send HB requests */ + s->tlsext_heartbeat |= SSL_TLSEXT_HB_ENABLED; + break; + case 0x02: /* Client doesn't accept HB requests */ + s->tlsext_heartbeat |= SSL_TLSEXT_HB_ENABLED; + s->tlsext_heartbeat |= SSL_TLSEXT_HB_DONT_SEND_REQUESTS; + break; + default: *al = SSL_AD_ILLEGAL_PARAMETER; + return 0; + } + } +#endif +#ifndef OPENSSL_NO_NEXTPROTONEG + else if (type == TLSEXT_TYPE_next_proto_neg && + s->s3->tmp.finish_md_len == 0) + { + /* We shouldn't accept this extension on a + * renegotiation. + * + * s->new_session will be set on renegotiation, but we + * probably shouldn't rely that it couldn't be set on + * the initial renegotation too in certain cases (when + * there's some other reason to disallow resuming an + * earlier session -- the current code won't be doing + * anything like that, but this might change). + + * A valid sign that there's been a previous handshake + * in this connection is if s->s3->tmp.finish_md_len > + * 0. (We are talking about a check that will happen + * in the Hello protocol round, well before a new + * Finished message could have been computed.) */ + s->s3->next_proto_neg_seen = 1; + } +#endif /* session ticket processed earlier */ + else if (type == TLSEXT_TYPE_use_srtp) + { + if(ssl_parse_clienthello_use_srtp_ext(s, data, size, + al)) + return 0; + } + data+=size; } @@ -1005,7 +1344,7 @@ int ssl_parse_clienthello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in /* Need RI if renegotiating */ - if (!renegotiate_seen && s->new_session && + if (!renegotiate_seen && s->renegotiate && !(s->options & SSL_OP_ALLOW_UNSAFE_LEGACY_RENEGOTIATION)) { *al = SSL_AD_HANDSHAKE_FAILURE; @@ -1017,6 +1356,26 @@ int ssl_parse_clienthello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in return 1; } +#ifndef OPENSSL_NO_NEXTPROTONEG +/* ssl_next_proto_validate validates a Next Protocol Negotiation block. No + * elements of zero length are allowed and the set of elements must exactly fill + * the length of the block. */ +static char ssl_next_proto_validate(unsigned char *d, unsigned len) + { + unsigned int off = 0; + + while (off < len) + { + if (d[off] == 0) + return 0; + off += d[off]; + off++; + } + + return off == len; + } +#endif + int ssl_parse_serverhello_tlsext(SSL *s, unsigned char **p, unsigned char *d, int n, int *al) { unsigned short length; @@ -1026,6 +1385,15 @@ int ssl_parse_serverhello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in int tlsext_servername = 0; int renegotiate_seen = 0; +#ifndef OPENSSL_NO_NEXTPROTONEG + s->s3->next_proto_neg_seen = 0; +#endif + +#ifndef OPENSSL_NO_HEARTBEATS + s->tlsext_heartbeat &= ~(SSL_TLSEXT_HB_ENABLED | + SSL_TLSEXT_HB_DONT_SEND_REQUESTS); +#endif + if (data >= (d+n-2)) goto ri_check; @@ -1151,12 +1519,71 @@ int ssl_parse_serverhello_tlsext(SSL *s, unsigned char **p, unsigned char *d, in /* Set flag to expect CertificateStatus message */ s->tlsext_status_expected = 1; } +#ifndef OPENSSL_NO_NEXTPROTONEG + else if (type == TLSEXT_TYPE_next_proto_neg && + s->s3->tmp.finish_md_len == 0) + { + unsigned char *selected; + unsigned char selected_len; + + /* We must have requested it. */ + if ((s->ctx->next_proto_select_cb == NULL)) + { + *al = TLS1_AD_UNSUPPORTED_EXTENSION; + return 0; + } + /* The data must be valid */ + if (!ssl_next_proto_validate(data, size)) + { + *al = TLS1_AD_DECODE_ERROR; + return 0; + } + if (s->ctx->next_proto_select_cb(s, &selected, &selected_len, data, size, s->ctx->next_proto_select_cb_arg) != SSL_TLSEXT_ERR_OK) + { + *al = TLS1_AD_INTERNAL_ERROR; + return 0; + } + s->next_proto_negotiated = OPENSSL_malloc(selected_len); + if (!s->next_proto_negotiated) + { + *al = TLS1_AD_INTERNAL_ERROR; + return 0; + } + memcpy(s->next_proto_negotiated, selected, selected_len); + s->next_proto_negotiated_len = selected_len; + s->s3->next_proto_neg_seen = 1; + } +#endif else if (type == TLSEXT_TYPE_renegotiate) { if(!ssl_parse_serverhello_renegotiate_ext(s, data, size, al)) return 0; renegotiate_seen = 1; } +#ifndef OPENSSL_NO_HEARTBEATS + else if (type == TLSEXT_TYPE_heartbeat) + { + switch(data[0]) + { + case 0x01: /* Server allows us to send HB requests */ + s->tlsext_heartbeat |= SSL_TLSEXT_HB_ENABLED; + break; + case 0x02: /* Server doesn't accept HB requests */ + s->tlsext_heartbeat |= SSL_TLSEXT_HB_ENABLED; + s->tlsext_heartbeat |= SSL_TLSEXT_HB_DONT_SEND_REQUESTS; + break; + default: *al = SSL_AD_ILLEGAL_PARAMETER; + return 0; + } + } +#endif + else if (type == TLSEXT_TYPE_use_srtp) + { + if(ssl_parse_serverhello_use_srtp_ext(s, data, size, + al)) + return 0; + } + data+=size; } @@ -1236,7 +1663,7 @@ int ssl_prepare_clienthello_tlsext(SSL *s) break; } } - using_ecc = using_ecc && (s->version == TLS1_VERSION); + using_ecc = using_ecc && (s->version >= TLS1_VERSION); if (using_ecc) { if (s->tlsext_ecpointformatlist != NULL) OPENSSL_free(s->tlsext_ecpointformatlist); @@ -1252,16 +1679,19 @@ int ssl_prepare_clienthello_tlsext(SSL *s) /* we support all named elliptic curves in draft-ietf-tls-ecc-12 */ if (s->tlsext_ellipticcurvelist != NULL) OPENSSL_free(s->tlsext_ellipticcurvelist); - s->tlsext_ellipticcurvelist_length = sizeof(nid_list)/sizeof(nid_list[0]) * 2; + s->tlsext_ellipticcurvelist_length = sizeof(pref_list)/sizeof(pref_list[0]) * 2; if ((s->tlsext_ellipticcurvelist = OPENSSL_malloc(s->tlsext_ellipticcurvelist_length)) == NULL) { s->tlsext_ellipticcurvelist_length = 0; SSLerr(SSL_F_SSL_PREPARE_CLIENTHELLO_TLSEXT,ERR_R_MALLOC_FAILURE); return -1; } - for (i = 1, j = s->tlsext_ellipticcurvelist; (unsigned int)i <= - sizeof(nid_list)/sizeof(nid_list[0]); i++) - s2n(i,j); + for (i = 0, j = s->tlsext_ellipticcurvelist; (unsigned int)i < + sizeof(pref_list)/sizeof(pref_list[0]); i++) + { + int id = tls1_ec_nid2curve_id(pref_list[i]); + s2n(id,j); + } } #endif /* OPENSSL_NO_EC */ @@ -1570,26 +2000,56 @@ int ssl_check_serverhello_tlsext(SSL *s) } } -/* Since the server cache lookup is done early on in the processing of client - * hello and other operations depend on the result we need to handle any TLS - * session ticket extension at the same time. +/* Since the server cache lookup is done early on in the processing of the + * ClientHello, and other operations depend on the result, we need to handle + * any TLS session ticket extension at the same time. + * + * session_id: points at the session ID in the ClientHello. This code will + * read past the end of this in order to parse out the session ticket + * extension, if any. + * len: the length of the session ID. + * limit: a pointer to the first byte after the ClientHello. + * ret: (output) on return, if a ticket was decrypted, then this is set to + * point to the resulting session. + * + * If s->tls_session_secret_cb is set then we are expecting a pre-shared key + * ciphersuite, in which case we have no use for session tickets and one will + * never be decrypted, nor will s->tlsext_ticket_expected be set to 1. + * + * Returns: + * -1: fatal error, either from parsing or decrypting the ticket. + * 0: no ticket was found (or was ignored, based on settings). + * 1: a zero length extension was found, indicating that the client supports + * session tickets but doesn't currently have one to offer. + * 2: either s->tls_session_secret_cb was set, or a ticket was offered but + * couldn't be decrypted because of a non-fatal error. + * 3: a ticket was successfully decrypted and *ret was set. + * + * Side effects: + * Sets s->tlsext_ticket_expected to 1 if the server will have to issue + * a new session ticket to the client because the client indicated support + * (and s->tls_session_secret_cb is NULL) but the client either doesn't have + * a session ticket or we couldn't use the one it gave us, or if + * s->ctx->tlsext_ticket_key_cb asked to renew the client's ticket. + * Otherwise, s->tlsext_ticket_expected is set to 0. */ - int tls1_process_ticket(SSL *s, unsigned char *session_id, int len, - const unsigned char *limit, SSL_SESSION **ret) + const unsigned char *limit, SSL_SESSION **ret) { /* Point after session ID in client hello */ const unsigned char *p = session_id + len; unsigned short i; + *ret = NULL; + s->tlsext_ticket_expected = 0; + /* If tickets disabled behave as if no ticket present - * to permit stateful resumption. - */ + * to permit stateful resumption. + */ if (SSL_get_options(s) & SSL_OP_NO_TICKET) - return 1; - + return 0; if ((s->version <= SSL3_VERSION) || !limit) - return 1; + return 0; if (p >= limit) return -1; /* Skip past DTLS cookie */ @@ -1612,7 +2072,7 @@ int tls1_process_ticket(SSL *s, unsigned char *session_id, int len, return -1; /* Now at start of extensions */ if ((p + 2) >= limit) - return 1; + return 0; n2s(p, i); while ((p + 4) <= limit) { @@ -1620,39 +2080,61 @@ int tls1_process_ticket(SSL *s, unsigned char *session_id, int len, n2s(p, type); n2s(p, size); if (p + size > limit) - return 1; + return 0; if (type == TLSEXT_TYPE_session_ticket) { - /* If tickets disabled indicate cache miss which will - * trigger a full handshake - */ - if (SSL_get_options(s) & SSL_OP_NO_TICKET) - return 1; - /* If zero length note client will accept a ticket - * and indicate cache miss to trigger full handshake - */ + int r; if (size == 0) { + /* The client will accept a ticket but doesn't + * currently have one. */ s->tlsext_ticket_expected = 1; - return 0; /* Cache miss */ + return 1; } if (s->tls_session_secret_cb) { - /* Indicate cache miss here and instead of - * generating the session from ticket now, - * trigger abbreviated handshake based on - * external mechanism to calculate the master - * secret later. */ - return 0; + /* Indicate that the ticket couldn't be + * decrypted rather than generating the session + * from ticket now, trigger abbreviated + * handshake based on external mechanism to + * calculate the master secret later. */ + return 2; + } + r = tls_decrypt_ticket(s, p, size, session_id, len, ret); + switch (r) + { + case 2: /* ticket couldn't be decrypted */ + s->tlsext_ticket_expected = 1; + return 2; + case 3: /* ticket was decrypted */ + return r; + case 4: /* ticket decrypted but need to renew */ + s->tlsext_ticket_expected = 1; + return 3; + default: /* fatal error */ + return -1; } - return tls_decrypt_ticket(s, p, size, session_id, len, - ret); } p += size; } - return 1; + return 0; } +/* tls_decrypt_ticket attempts to decrypt a session ticket. + * + * etick: points to the body of the session ticket extension. + * eticklen: the length of the session tickets extenion. + * sess_id: points at the session ID. + * sesslen: the length of the session ID. + * psess: (output) on return, if a ticket was decrypted, then this is set to + * point to the resulting session. + * + * Returns: + * -1: fatal error, either from parsing or decrypting the ticket. + * 2: the ticket couldn't be decrypted. + * 3: a ticket was successfully decrypted and *psess was set. + * 4: same as 3, but the ticket needs to be renewed. + */ static int tls_decrypt_ticket(SSL *s, const unsigned char *etick, int eticklen, const unsigned char *sess_id, int sesslen, SSL_SESSION **psess) @@ -1667,7 +2149,7 @@ static int tls_decrypt_ticket(SSL *s, const unsigned char *etick, int eticklen, SSL_CTX *tctx = s->initial_ctx; /* Need at least keyname + iv + some encrypted data */ if (eticklen < 48) - goto tickerr; + return 2; /* Initialize session ticket encryption and HMAC contexts */ HMAC_CTX_init(&hctx); EVP_CIPHER_CTX_init(&ctx); @@ -1679,7 +2161,7 @@ static int tls_decrypt_ticket(SSL *s, const unsigned char *etick, int eticklen, if (rv < 0) return -1; if (rv == 0) - goto tickerr; + return 2; if (rv == 2) renew_ticket = 1; } @@ -1687,15 +2169,15 @@ static int tls_decrypt_ticket(SSL *s, const unsigned char *etick, int eticklen, { /* Check key name matches */ if (memcmp(etick, tctx->tlsext_tick_key_name, 16)) - goto tickerr; + return 2; HMAC_Init_ex(&hctx, tctx->tlsext_tick_hmac_key, 16, tlsext_tick_md(), NULL); EVP_DecryptInit_ex(&ctx, EVP_aes_128_cbc(), NULL, tctx->tlsext_tick_aes_key, etick + 16); } /* Attempt to process session ticket, first conduct sanity and - * integrity checks on ticket. - */ + * integrity checks on ticket. + */ mlen = HMAC_size(&hctx); if (mlen < 0) { @@ -1708,7 +2190,7 @@ static int tls_decrypt_ticket(SSL *s, const unsigned char *etick, int eticklen, HMAC_Final(&hctx, tick_hmac, NULL); HMAC_CTX_cleanup(&hctx); if (memcmp(tick_hmac, etick + eticklen, mlen)) - goto tickerr; + return 2; /* Attempt to decrypt session data */ /* Move p after IV to start of encrypted ticket, update length */ p = etick + 16 + EVP_CIPHER_CTX_iv_length(&ctx); @@ -1721,33 +2203,376 @@ static int tls_decrypt_ticket(SSL *s, const unsigned char *etick, int eticklen, } EVP_DecryptUpdate(&ctx, sdec, &slen, p, eticklen); if (EVP_DecryptFinal(&ctx, sdec + slen, &mlen) <= 0) - goto tickerr; + return 2; slen += mlen; EVP_CIPHER_CTX_cleanup(&ctx); p = sdec; - + sess = d2i_SSL_SESSION(NULL, &p, slen); OPENSSL_free(sdec); if (sess) { - /* The session ID if non-empty is used by some clients to - * detect that the ticket has been accepted. So we copy it to - * the session structure. If it is empty set length to zero - * as required by standard. - */ + /* The session ID, if non-empty, is used by some clients to + * detect that the ticket has been accepted. So we copy it to + * the session structure. If it is empty set length to zero + * as required by standard. + */ if (sesslen) memcpy(sess->session_id, sess_id, sesslen); sess->session_id_length = sesslen; *psess = sess; - s->tlsext_ticket_expected = renew_ticket; + if (renew_ticket) + return 4; + else + return 3; + } + ERR_clear_error(); + /* For session parse failure, indicate that we need to send a new + * ticket. */ + return 2; + } + +/* Tables to translate from NIDs to TLS v1.2 ids */ + +typedef struct + { + int nid; + int id; + } tls12_lookup; + +static tls12_lookup tls12_md[] = { +#ifndef OPENSSL_NO_MD5 + {NID_md5, TLSEXT_hash_md5}, +#endif +#ifndef OPENSSL_NO_SHA + {NID_sha1, TLSEXT_hash_sha1}, +#endif +#ifndef OPENSSL_NO_SHA256 + {NID_sha224, TLSEXT_hash_sha224}, + {NID_sha256, TLSEXT_hash_sha256}, +#endif +#ifndef OPENSSL_NO_SHA512 + {NID_sha384, TLSEXT_hash_sha384}, + {NID_sha512, TLSEXT_hash_sha512} +#endif +}; + +static tls12_lookup tls12_sig[] = { +#ifndef OPENSSL_NO_RSA + {EVP_PKEY_RSA, TLSEXT_signature_rsa}, +#endif +#ifndef OPENSSL_NO_DSA + {EVP_PKEY_DSA, TLSEXT_signature_dsa}, +#endif +#ifndef OPENSSL_NO_ECDSA + {EVP_PKEY_EC, TLSEXT_signature_ecdsa} +#endif +}; + +static int tls12_find_id(int nid, tls12_lookup *table, size_t tlen) + { + size_t i; + for (i = 0; i < tlen; i++) + { + if (table[i].nid == nid) + return table[i].id; + } + return -1; + } +#if 0 +static int tls12_find_nid(int id, tls12_lookup *table, size_t tlen) + { + size_t i; + for (i = 0; i < tlen; i++) + { + if (table[i].id == id) + return table[i].nid; + } + return -1; + } +#endif + +int tls12_get_sigandhash(unsigned char *p, const EVP_PKEY *pk, const EVP_MD *md) + { + int sig_id, md_id; + if (!md) + return 0; + md_id = tls12_find_id(EVP_MD_type(md), tls12_md, + sizeof(tls12_md)/sizeof(tls12_lookup)); + if (md_id == -1) + return 0; + sig_id = tls12_get_sigid(pk); + if (sig_id == -1) + return 0; + p[0] = (unsigned char)md_id; + p[1] = (unsigned char)sig_id; + return 1; + } + +int tls12_get_sigid(const EVP_PKEY *pk) + { + return tls12_find_id(pk->type, tls12_sig, + sizeof(tls12_sig)/sizeof(tls12_lookup)); + } + +const EVP_MD *tls12_get_hash(unsigned char hash_alg) + { + switch(hash_alg) + { +#ifndef OPENSSL_NO_MD5 + case TLSEXT_hash_md5: +#ifdef OPENSSL_FIPS + if (FIPS_mode()) + return NULL; +#endif + return EVP_md5(); +#endif +#ifndef OPENSSL_NO_SHA + case TLSEXT_hash_sha1: + return EVP_sha1(); +#endif +#ifndef OPENSSL_NO_SHA256 + case TLSEXT_hash_sha224: + return EVP_sha224(); + + case TLSEXT_hash_sha256: + return EVP_sha256(); +#endif +#ifndef OPENSSL_NO_SHA512 + case TLSEXT_hash_sha384: + return EVP_sha384(); + + case TLSEXT_hash_sha512: + return EVP_sha512(); +#endif + default: + return NULL; + + } + } + +/* Set preferred digest for each key type */ + +int tls1_process_sigalgs(SSL *s, const unsigned char *data, int dsize) + { + int i, idx; + const EVP_MD *md; + CERT *c = s->cert; + /* Extension ignored for TLS versions below 1.2 */ + if (TLS1_get_version(s) < TLS1_2_VERSION) return 1; + /* Should never happen */ + if (!c) + return 0; + + c->pkeys[SSL_PKEY_DSA_SIGN].digest = NULL; + c->pkeys[SSL_PKEY_RSA_SIGN].digest = NULL; + c->pkeys[SSL_PKEY_RSA_ENC].digest = NULL; + c->pkeys[SSL_PKEY_ECC].digest = NULL; + + for (i = 0; i < dsize; i += 2) + { + unsigned char hash_alg = data[i], sig_alg = data[i+1]; + + switch(sig_alg) + { +#ifndef OPENSSL_NO_RSA + case TLSEXT_signature_rsa: + idx = SSL_PKEY_RSA_SIGN; + break; +#endif +#ifndef OPENSSL_NO_DSA + case TLSEXT_signature_dsa: + idx = SSL_PKEY_DSA_SIGN; + break; +#endif +#ifndef OPENSSL_NO_ECDSA + case TLSEXT_signature_ecdsa: + idx = SSL_PKEY_ECC; + break; +#endif + default: + continue; + } + + if (c->pkeys[idx].digest == NULL) + { + md = tls12_get_hash(hash_alg); + if (md) + { + c->pkeys[idx].digest = md; + if (idx == SSL_PKEY_RSA_SIGN) + c->pkeys[SSL_PKEY_RSA_ENC].digest = md; + } + } + } - /* If session decrypt failure indicate a cache miss and set state to - * send a new ticket - */ - tickerr: - s->tlsext_ticket_expected = 1; + + + /* Set any remaining keys to default values. NOTE: if alg is not + * supported it stays as NULL. + */ +#ifndef OPENSSL_NO_DSA + if (!c->pkeys[SSL_PKEY_DSA_SIGN].digest) + c->pkeys[SSL_PKEY_DSA_SIGN].digest = EVP_dss1(); +#endif +#ifndef OPENSSL_NO_RSA + if (!c->pkeys[SSL_PKEY_RSA_SIGN].digest) + { + c->pkeys[SSL_PKEY_RSA_SIGN].digest = EVP_sha1(); + c->pkeys[SSL_PKEY_RSA_ENC].digest = EVP_sha1(); + } +#endif +#ifndef OPENSSL_NO_ECDSA + if (!c->pkeys[SSL_PKEY_ECC].digest) + c->pkeys[SSL_PKEY_ECC].digest = EVP_ecdsa(); +#endif + return 1; + } + +#endif + +#ifndef OPENSSL_NO_HEARTBEATS +int +tls1_process_heartbeat(SSL *s) + { + unsigned char *p = &s->s3->rrec.data[0], *pl; + unsigned short hbtype; + unsigned int payload; + unsigned int padding = 16; /* Use minimum padding */ + + /* Read type and payload length first */ + hbtype = *p++; + n2s(p, payload); + pl = p; + + if (s->msg_callback) + s->msg_callback(0, s->version, TLS1_RT_HEARTBEAT, + &s->s3->rrec.data[0], s->s3->rrec.length, + s, s->msg_callback_arg); + + if (hbtype == TLS1_HB_REQUEST) + { + unsigned char *buffer, *bp; + int r; + + /* Allocate memory for the response, size is 1 bytes + * message type, plus 2 bytes payload length, plus + * payload, plus padding + */ + buffer = OPENSSL_malloc(1 + 2 + payload + padding); + bp = buffer; + + /* Enter response type, length and copy payload */ + *bp++ = TLS1_HB_RESPONSE; + s2n(payload, bp); + memcpy(bp, pl, payload); + bp += payload; + /* Random padding */ + RAND_pseudo_bytes(bp, padding); + + r = ssl3_write_bytes(s, TLS1_RT_HEARTBEAT, buffer, 3 + payload + padding); + + if (r >= 0 && s->msg_callback) + s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT, + buffer, 3 + payload + padding, + s, s->msg_callback_arg); + + OPENSSL_free(buffer); + + if (r < 0) + return r; + } + else if (hbtype == TLS1_HB_RESPONSE) + { + unsigned int seq; + + /* We only send sequence numbers (2 bytes unsigned int), + * and 16 random bytes, so we just try to read the + * sequence number */ + n2s(pl, seq); + + if (payload == 18 && seq == s->tlsext_hb_seq) + { + s->tlsext_hb_seq++; + s->tlsext_hb_pending = 0; + } + } + return 0; } +int +tls1_heartbeat(SSL *s) + { + unsigned char *buf, *p; + int ret; + unsigned int payload = 18; /* Sequence number + random bytes */ + unsigned int padding = 16; /* Use minimum padding */ + + /* Only send if peer supports and accepts HB requests... */ + if (!(s->tlsext_heartbeat & SSL_TLSEXT_HB_ENABLED) || + s->tlsext_heartbeat & SSL_TLSEXT_HB_DONT_SEND_REQUESTS) + { + SSLerr(SSL_F_TLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PEER_DOESNT_ACCEPT); + return -1; + } + + /* ...and there is none in flight yet... */ + if (s->tlsext_hb_pending) + { + SSLerr(SSL_F_TLS1_HEARTBEAT,SSL_R_TLS_HEARTBEAT_PENDING); + return -1; + } + + /* ...and no handshake in progress. */ + if (SSL_in_init(s) || s->in_handshake) + { + SSLerr(SSL_F_TLS1_HEARTBEAT,SSL_R_UNEXPECTED_MESSAGE); + return -1; + } + + /* Check if padding is too long, payload and padding + * must not exceed 2^14 - 3 = 16381 bytes in total. + */ + OPENSSL_assert(payload + padding <= 16381); + + /* Create HeartBeat message, we just use a sequence number + * as payload to distuingish different messages and add + * some random stuff. + * - Message Type, 1 byte + * - Payload Length, 2 bytes (unsigned int) + * - Payload, the sequence number (2 bytes uint) + * - Payload, random bytes (16 bytes uint) + * - Padding + */ + buf = OPENSSL_malloc(1 + 2 + payload + padding); + p = buf; + /* Message Type */ + *p++ = TLS1_HB_REQUEST; + /* Payload length (18 bytes here) */ + s2n(payload, p); + /* Sequence number */ + s2n(s->tlsext_hb_seq, p); + /* 16 random bytes */ + RAND_pseudo_bytes(p, 16); + p += 16; + /* Random padding */ + RAND_pseudo_bytes(p, padding); + + ret = ssl3_write_bytes(s, TLS1_RT_HEARTBEAT, buf, 3 + payload + padding); + if (ret >= 0) + { + if (s->msg_callback) + s->msg_callback(1, s->version, TLS1_RT_HEARTBEAT, + buf, 3 + payload + padding, + s, s->msg_callback_arg); + + s->tlsext_hb_pending = 1; + } + + OPENSSL_free(buf); + + return ret; + } #endif diff --git a/src/lib/libssl/t1_meth.c b/src/lib/libssl/t1_meth.c index 6ce7c0bbf5..53c807de28 100644 --- a/src/lib/libssl/t1_meth.c +++ b/src/lib/libssl/t1_meth.c @@ -60,16 +60,28 @@ #include #include "ssl_locl.h" -static const SSL_METHOD *tls1_get_method(int ver); static const SSL_METHOD *tls1_get_method(int ver) { + if (ver == TLS1_2_VERSION) + return TLSv1_2_method(); + if (ver == TLS1_1_VERSION) + return TLSv1_1_method(); if (ver == TLS1_VERSION) - return(TLSv1_method()); - else - return(NULL); + return TLSv1_method(); + return NULL; } -IMPLEMENT_tls1_meth_func(TLSv1_method, +IMPLEMENT_tls_meth_func(TLS1_2_VERSION, TLSv1_2_method, + ssl3_accept, + ssl3_connect, + tls1_get_method) + +IMPLEMENT_tls_meth_func(TLS1_1_VERSION, TLSv1_1_method, + ssl3_accept, + ssl3_connect, + tls1_get_method) + +IMPLEMENT_tls_meth_func(TLS1_VERSION, TLSv1_method, ssl3_accept, ssl3_connect, tls1_get_method) diff --git a/src/lib/libssl/t1_srvr.c b/src/lib/libssl/t1_srvr.c index 42525e9e89..f1d1565769 100644 --- a/src/lib/libssl/t1_srvr.c +++ b/src/lib/libssl/t1_srvr.c @@ -67,13 +67,26 @@ static const SSL_METHOD *tls1_get_server_method(int ver); static const SSL_METHOD *tls1_get_server_method(int ver) { + if (ver == TLS1_2_VERSION) + return TLSv1_2_server_method(); + if (ver == TLS1_1_VERSION) + return TLSv1_1_server_method(); if (ver == TLS1_VERSION) - return(TLSv1_server_method()); - else - return(NULL); + return TLSv1_server_method(); + return NULL; } -IMPLEMENT_tls1_meth_func(TLSv1_server_method, +IMPLEMENT_tls_meth_func(TLS1_2_VERSION, TLSv1_2_server_method, + ssl3_accept, + ssl_undefined_function, + tls1_get_server_method) + +IMPLEMENT_tls_meth_func(TLS1_1_VERSION, TLSv1_1_server_method, + ssl3_accept, + ssl_undefined_function, + tls1_get_server_method) + +IMPLEMENT_tls_meth_func(TLS1_VERSION, TLSv1_server_method, ssl3_accept, ssl_undefined_function, tls1_get_server_method) diff --git a/src/lib/libssl/test/CAss.cnf b/src/lib/libssl/test/CAss.cnf index 20f8f05e3d..109bc8c10b 100644 --- a/src/lib/libssl/test/CAss.cnf +++ b/src/lib/libssl/test/CAss.cnf @@ -7,7 +7,7 @@ RANDFILE = ./.rnd #################################################################### [ req ] -default_bits = 512 +default_bits = 2048 default_keyfile = keySS.pem distinguished_name = req_distinguished_name encrypt_rsa_key = no diff --git a/src/lib/libssl/test/P1ss.cnf b/src/lib/libssl/test/P1ss.cnf index 876a0d35f8..326cce2ba8 100644 --- a/src/lib/libssl/test/P1ss.cnf +++ b/src/lib/libssl/test/P1ss.cnf @@ -7,7 +7,7 @@ RANDFILE = ./.rnd #################################################################### [ req ] -default_bits = 512 +default_bits = 1024 default_keyfile = keySS.pem distinguished_name = req_distinguished_name encrypt_rsa_key = no diff --git a/src/lib/libssl/test/P2ss.cnf b/src/lib/libssl/test/P2ss.cnf index 373a87e7c2..8b502321b8 100644 --- a/src/lib/libssl/test/P2ss.cnf +++ b/src/lib/libssl/test/P2ss.cnf @@ -7,7 +7,7 @@ RANDFILE = ./.rnd #################################################################### [ req ] -default_bits = 512 +default_bits = 1024 default_keyfile = keySS.pem distinguished_name = req_distinguished_name encrypt_rsa_key = no diff --git a/src/lib/libssl/test/Uss.cnf b/src/lib/libssl/test/Uss.cnf index 0c0ebb5f67..58ac0ca54d 100644 --- a/src/lib/libssl/test/Uss.cnf +++ b/src/lib/libssl/test/Uss.cnf @@ -7,11 +7,11 @@ RANDFILE = ./.rnd #################################################################### [ req ] -default_bits = 512 +default_bits = 2048 default_keyfile = keySS.pem distinguished_name = req_distinguished_name encrypt_rsa_key = no -default_md = md2 +default_md = sha256 [ req_distinguished_name ] countryName = Country Name (2 letter code) diff --git a/src/lib/libssl/test/pkits-test.pl b/src/lib/libssl/test/pkits-test.pl index 69dffa16f9..5c6b89fcdb 100644 --- a/src/lib/libssl/test/pkits-test.pl +++ b/src/lib/libssl/test/pkits-test.pl @@ -784,6 +784,15 @@ my $ossl = "ossl/apps/openssl"; my $ossl_cmd = "$ossl_path cms -verify -verify_retcode "; $ossl_cmd .= "-CAfile pkitsta.pem -crl_check_all -x509_strict "; + +# Check for expiry of trust anchor +system "$ossl_path x509 -inform DER -in $pkitsta -checkend 0"; +if ($? == 256) + { + print STDERR "WARNING: using older expired data\n"; + $ossl_cmd .= "-attime 1291940972 "; + } + $ossl_cmd .= "-policy_check -extended_crl -use_deltas -out /dev/null 2>&1 "; system "$ossl_path x509 -inform DER -in $pkitsta -out pkitsta.pem"; diff --git a/src/lib/libssl/test/test.cnf b/src/lib/libssl/test/test.cnf index faad3914a8..10834442a1 100644 --- a/src/lib/libssl/test/test.cnf +++ b/src/lib/libssl/test/test.cnf @@ -56,7 +56,7 @@ emailAddress = optional #################################################################### [ req ] -default_bits = 512 +default_bits = 1024 default_keyfile = testkey.pem distinguished_name = req_distinguished_name encrypt_rsa_key = no diff --git a/src/lib/libssl/test/testssl b/src/lib/libssl/test/testssl index b55364ae88..5ae4dc8720 100644 --- a/src/lib/libssl/test/testssl +++ b/src/lib/libssl/test/testssl @@ -148,4 +148,14 @@ $ssltest -tls1 -cipher PSK -psk abc123 $extra || exit 1 echo test tls1 with PSK via BIO pair $ssltest -bio_pair -tls1 -cipher PSK -psk abc123 $extra || exit 1 +if ../util/shlib_wrap.sh ../apps/openssl no-srp; then + echo skipping SRP tests +else + echo test tls1 with SRP + $ssltest -tls1 -cipher SRP -srpuser test -srppass abc123 + + echo test tls1 with SRP via BIO pair + $ssltest -bio_pair -tls1 -cipher SRP -srpuser test -srppass abc123 +fi + exit 0 diff --git a/src/lib/libssl/tls1.h b/src/lib/libssl/tls1.h index b3cc8f098b..c39c267f0b 100644 --- a/src/lib/libssl/tls1.h +++ b/src/lib/libssl/tls1.h @@ -159,10 +159,24 @@ extern "C" { #define TLS1_ALLOW_EXPERIMENTAL_CIPHERSUITES 0 +#define TLS1_2_VERSION 0x0303 +#define TLS1_2_VERSION_MAJOR 0x03 +#define TLS1_2_VERSION_MINOR 0x03 + +#define TLS1_1_VERSION 0x0302 +#define TLS1_1_VERSION_MAJOR 0x03 +#define TLS1_1_VERSION_MINOR 0x02 + #define TLS1_VERSION 0x0301 #define TLS1_VERSION_MAJOR 0x03 #define TLS1_VERSION_MINOR 0x01 +#define TLS1_get_version(s) \ + ((s->version >> 8) == TLS1_VERSION_MAJOR ? s->version : 0) + +#define TLS1_get_client_version(s) \ + ((s->client_version >> 8) == TLS1_VERSION_MAJOR ? s->client_version : 0) + #define TLS1_AD_DECRYPTION_FAILED 21 #define TLS1_AD_RECORD_OVERFLOW 22 #define TLS1_AD_UNKNOWN_CA 48 /* fatal */ @@ -183,17 +197,42 @@ extern "C" { #define TLS1_AD_BAD_CERTIFICATE_HASH_VALUE 114 #define TLS1_AD_UNKNOWN_PSK_IDENTITY 115 /* fatal */ -/* ExtensionType values from RFC3546 / RFC4366 */ +/* ExtensionType values from RFC3546 / RFC4366 / RFC6066 */ #define TLSEXT_TYPE_server_name 0 #define TLSEXT_TYPE_max_fragment_length 1 #define TLSEXT_TYPE_client_certificate_url 2 #define TLSEXT_TYPE_trusted_ca_keys 3 #define TLSEXT_TYPE_truncated_hmac 4 #define TLSEXT_TYPE_status_request 5 +/* ExtensionType values from RFC4681 */ +#define TLSEXT_TYPE_user_mapping 6 + +/* ExtensionType values from RFC5878 */ +#define TLSEXT_TYPE_client_authz 7 +#define TLSEXT_TYPE_server_authz 8 + +/* ExtensionType values from RFC6091 */ +#define TLSEXT_TYPE_cert_type 9 + /* ExtensionType values from RFC4492 */ #define TLSEXT_TYPE_elliptic_curves 10 #define TLSEXT_TYPE_ec_point_formats 11 + +/* ExtensionType value from RFC5054 */ +#define TLSEXT_TYPE_srp 12 + +/* ExtensionType values from RFC5246 */ +#define TLSEXT_TYPE_signature_algorithms 13 + +/* ExtensionType value from RFC5764 */ +#define TLSEXT_TYPE_use_srtp 14 + +/* ExtensionType value from RFC5620 */ +#define TLSEXT_TYPE_heartbeat 15 + +/* ExtensionType value from RFC4507 */ #define TLSEXT_TYPE_session_ticket 35 + /* ExtensionType value from draft-rescorla-tls-opaque-prf-input-00.txt */ #if 0 /* will have to be provided externally for now , * i.e. build with -DTLSEXT_TYPE_opaque_prf_input=38183 @@ -204,6 +243,11 @@ extern "C" { /* Temporary extension type */ #define TLSEXT_TYPE_renegotiate 0xff01 +#ifndef OPENSSL_NO_NEXTPROTONEG +/* This is not an IANA defined extension number */ +#define TLSEXT_TYPE_next_proto_neg 13172 +#endif + /* NameType value from RFC 3546 */ #define TLSEXT_NAMETYPE_host_name 0 /* status request value from RFC 3546 */ @@ -216,12 +260,37 @@ extern "C" { #define TLSEXT_ECPOINTFORMAT_ansiX962_compressed_char2 2 #define TLSEXT_ECPOINTFORMAT_last 2 +/* Signature and hash algorithms from RFC 5246 */ + +#define TLSEXT_signature_anonymous 0 +#define TLSEXT_signature_rsa 1 +#define TLSEXT_signature_dsa 2 +#define TLSEXT_signature_ecdsa 3 + +#define TLSEXT_hash_none 0 +#define TLSEXT_hash_md5 1 +#define TLSEXT_hash_sha1 2 +#define TLSEXT_hash_sha224 3 +#define TLSEXT_hash_sha256 4 +#define TLSEXT_hash_sha384 5 +#define TLSEXT_hash_sha512 6 + #ifndef OPENSSL_NO_TLSEXT #define TLSEXT_MAXLEN_host_name 255 -const char *SSL_get_servername(const SSL *s, const int type) ; -int SSL_get_servername_type(const SSL *s) ; +const char *SSL_get_servername(const SSL *s, const int type); +int SSL_get_servername_type(const SSL *s); +/* SSL_export_keying_material exports a value derived from the master secret, + * as specified in RFC 5705. It writes |olen| bytes to |out| given a label and + * optional context. (Since a zero length context is allowed, the |use_context| + * flag controls whether a context is included.) + * + * It returns 1 on success and zero otherwise. + */ +int SSL_export_keying_material(SSL *s, unsigned char *out, size_t olen, + const char *label, size_t llen, const unsigned char *p, size_t plen, + int use_context); #define SSL_set_tlsext_host_name(s,name) \ SSL_ctrl(s,SSL_CTRL_SET_TLSEXT_HOSTNAME,TLSEXT_NAMETYPE_host_name,(char *)name) @@ -285,6 +354,16 @@ SSL_CTX_ctrl(ctx,SSL_CTRL_SET_TLSEXT_OPAQUE_PRF_INPUT_CB_ARG, 0, arg) #define SSL_CTX_set_tlsext_ticket_key_cb(ssl, cb) \ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb) +#ifndef OPENSSL_NO_HEARTBEATS +#define SSL_TLSEXT_HB_ENABLED 0x01 +#define SSL_TLSEXT_HB_DONT_SEND_REQUESTS 0x02 +#define SSL_TLSEXT_HB_DONT_RECV_REQUESTS 0x04 + +#define SSL_get_tlsext_heartbeat_pending(ssl) \ + SSL_ctrl((ssl),SSL_CTRL_GET_TLS_EXT_HEARTBEAT_PENDING,0,NULL) +#define SSL_set_tlsext_heartbeat_no_requests(ssl, arg) \ + SSL_ctrl((ssl),SSL_CTRL_SET_TLS_EXT_HEARTBEAT_NO_REQUESTS,arg,NULL) +#endif #endif /* PSK ciphersuites from 4279 */ @@ -322,6 +401,14 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb) #define TLS1_CK_DHE_RSA_WITH_AES_256_SHA 0x03000039 #define TLS1_CK_ADH_WITH_AES_256_SHA 0x0300003A +/* TLS v1.2 ciphersuites */ +#define TLS1_CK_RSA_WITH_NULL_SHA256 0x0300003B +#define TLS1_CK_RSA_WITH_AES_128_SHA256 0x0300003C +#define TLS1_CK_RSA_WITH_AES_256_SHA256 0x0300003D +#define TLS1_CK_DH_DSS_WITH_AES_128_SHA256 0x0300003E +#define TLS1_CK_DH_RSA_WITH_AES_128_SHA256 0x0300003F +#define TLS1_CK_DHE_DSS_WITH_AES_128_SHA256 0x03000040 + /* Camellia ciphersuites from RFC4132 */ #define TLS1_CK_RSA_WITH_CAMELLIA_128_CBC_SHA 0x03000041 #define TLS1_CK_DH_DSS_WITH_CAMELLIA_128_CBC_SHA 0x03000042 @@ -330,6 +417,16 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb) #define TLS1_CK_DHE_RSA_WITH_CAMELLIA_128_CBC_SHA 0x03000045 #define TLS1_CK_ADH_WITH_CAMELLIA_128_CBC_SHA 0x03000046 +/* TLS v1.2 ciphersuites */ +#define TLS1_CK_DHE_RSA_WITH_AES_128_SHA256 0x03000067 +#define TLS1_CK_DH_DSS_WITH_AES_256_SHA256 0x03000068 +#define TLS1_CK_DH_RSA_WITH_AES_256_SHA256 0x03000069 +#define TLS1_CK_DHE_DSS_WITH_AES_256_SHA256 0x0300006A +#define TLS1_CK_DHE_RSA_WITH_AES_256_SHA256 0x0300006B +#define TLS1_CK_ADH_WITH_AES_128_SHA256 0x0300006C +#define TLS1_CK_ADH_WITH_AES_256_SHA256 0x0300006D + +/* Camellia ciphersuites from RFC4132 */ #define TLS1_CK_RSA_WITH_CAMELLIA_256_CBC_SHA 0x03000084 #define TLS1_CK_DH_DSS_WITH_CAMELLIA_256_CBC_SHA 0x03000085 #define TLS1_CK_DH_RSA_WITH_CAMELLIA_256_CBC_SHA 0x03000086 @@ -345,6 +442,20 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb) #define TLS1_CK_DHE_RSA_WITH_SEED_SHA 0x0300009A #define TLS1_CK_ADH_WITH_SEED_SHA 0x0300009B +/* TLS v1.2 GCM ciphersuites from RFC5288 */ +#define TLS1_CK_RSA_WITH_AES_128_GCM_SHA256 0x0300009C +#define TLS1_CK_RSA_WITH_AES_256_GCM_SHA384 0x0300009D +#define TLS1_CK_DHE_RSA_WITH_AES_128_GCM_SHA256 0x0300009E +#define TLS1_CK_DHE_RSA_WITH_AES_256_GCM_SHA384 0x0300009F +#define TLS1_CK_DH_RSA_WITH_AES_128_GCM_SHA256 0x030000A0 +#define TLS1_CK_DH_RSA_WITH_AES_256_GCM_SHA384 0x030000A1 +#define TLS1_CK_DHE_DSS_WITH_AES_128_GCM_SHA256 0x030000A2 +#define TLS1_CK_DHE_DSS_WITH_AES_256_GCM_SHA384 0x030000A3 +#define TLS1_CK_DH_DSS_WITH_AES_128_GCM_SHA256 0x030000A4 +#define TLS1_CK_DH_DSS_WITH_AES_256_GCM_SHA384 0x030000A5 +#define TLS1_CK_ADH_WITH_AES_128_GCM_SHA256 0x030000A6 +#define TLS1_CK_ADH_WITH_AES_256_GCM_SHA384 0x030000A7 + /* ECC ciphersuites from draft-ietf-tls-ecc-12.txt with changes soon to be in draft 13 */ #define TLS1_CK_ECDH_ECDSA_WITH_NULL_SHA 0x0300C001 #define TLS1_CK_ECDH_ECDSA_WITH_RC4_128_SHA 0x0300C002 @@ -376,6 +487,38 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb) #define TLS1_CK_ECDH_anon_WITH_AES_128_CBC_SHA 0x0300C018 #define TLS1_CK_ECDH_anon_WITH_AES_256_CBC_SHA 0x0300C019 +/* SRP ciphersuites from RFC 5054 */ +#define TLS1_CK_SRP_SHA_WITH_3DES_EDE_CBC_SHA 0x0300C01A +#define TLS1_CK_SRP_SHA_RSA_WITH_3DES_EDE_CBC_SHA 0x0300C01B +#define TLS1_CK_SRP_SHA_DSS_WITH_3DES_EDE_CBC_SHA 0x0300C01C +#define TLS1_CK_SRP_SHA_WITH_AES_128_CBC_SHA 0x0300C01D +#define TLS1_CK_SRP_SHA_RSA_WITH_AES_128_CBC_SHA 0x0300C01E +#define TLS1_CK_SRP_SHA_DSS_WITH_AES_128_CBC_SHA 0x0300C01F +#define TLS1_CK_SRP_SHA_WITH_AES_256_CBC_SHA 0x0300C020 +#define TLS1_CK_SRP_SHA_RSA_WITH_AES_256_CBC_SHA 0x0300C021 +#define TLS1_CK_SRP_SHA_DSS_WITH_AES_256_CBC_SHA 0x0300C022 + +/* ECDH HMAC based ciphersuites from RFC5289 */ + +#define TLS1_CK_ECDHE_ECDSA_WITH_AES_128_SHA256 0x0300C023 +#define TLS1_CK_ECDHE_ECDSA_WITH_AES_256_SHA384 0x0300C024 +#define TLS1_CK_ECDH_ECDSA_WITH_AES_128_SHA256 0x0300C025 +#define TLS1_CK_ECDH_ECDSA_WITH_AES_256_SHA384 0x0300C026 +#define TLS1_CK_ECDHE_RSA_WITH_AES_128_SHA256 0x0300C027 +#define TLS1_CK_ECDHE_RSA_WITH_AES_256_SHA384 0x0300C028 +#define TLS1_CK_ECDH_RSA_WITH_AES_128_SHA256 0x0300C029 +#define TLS1_CK_ECDH_RSA_WITH_AES_256_SHA384 0x0300C02A + +/* ECDH GCM based ciphersuites from RFC5289 */ +#define TLS1_CK_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256 0x0300C02B +#define TLS1_CK_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384 0x0300C02C +#define TLS1_CK_ECDH_ECDSA_WITH_AES_128_GCM_SHA256 0x0300C02D +#define TLS1_CK_ECDH_ECDSA_WITH_AES_256_GCM_SHA384 0x0300C02E +#define TLS1_CK_ECDHE_RSA_WITH_AES_128_GCM_SHA256 0x0300C02F +#define TLS1_CK_ECDHE_RSA_WITH_AES_256_GCM_SHA384 0x0300C030 +#define TLS1_CK_ECDH_RSA_WITH_AES_128_GCM_SHA256 0x0300C031 +#define TLS1_CK_ECDH_RSA_WITH_AES_256_GCM_SHA384 0x0300C032 + /* XXX * Inconsistency alert: * The OpenSSL names of ciphers with ephemeral DH here include the string @@ -443,6 +586,17 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb) #define TLS1_TXT_PSK_WITH_AES_128_CBC_SHA "PSK-AES128-CBC-SHA" #define TLS1_TXT_PSK_WITH_AES_256_CBC_SHA "PSK-AES256-CBC-SHA" +/* SRP ciphersuite from RFC 5054 */ +#define TLS1_TXT_SRP_SHA_WITH_3DES_EDE_CBC_SHA "SRP-3DES-EDE-CBC-SHA" +#define TLS1_TXT_SRP_SHA_RSA_WITH_3DES_EDE_CBC_SHA "SRP-RSA-3DES-EDE-CBC-SHA" +#define TLS1_TXT_SRP_SHA_DSS_WITH_3DES_EDE_CBC_SHA "SRP-DSS-3DES-EDE-CBC-SHA" +#define TLS1_TXT_SRP_SHA_WITH_AES_128_CBC_SHA "SRP-AES-128-CBC-SHA" +#define TLS1_TXT_SRP_SHA_RSA_WITH_AES_128_CBC_SHA "SRP-RSA-AES-128-CBC-SHA" +#define TLS1_TXT_SRP_SHA_DSS_WITH_AES_128_CBC_SHA "SRP-DSS-AES-128-CBC-SHA" +#define TLS1_TXT_SRP_SHA_WITH_AES_256_CBC_SHA "SRP-AES-256-CBC-SHA" +#define TLS1_TXT_SRP_SHA_RSA_WITH_AES_256_CBC_SHA "SRP-RSA-AES-256-CBC-SHA" +#define TLS1_TXT_SRP_SHA_DSS_WITH_AES_256_CBC_SHA "SRP-DSS-AES-256-CBC-SHA" + /* Camellia ciphersuites from RFC4132 */ #define TLS1_TXT_RSA_WITH_CAMELLIA_128_CBC_SHA "CAMELLIA128-SHA" #define TLS1_TXT_DH_DSS_WITH_CAMELLIA_128_CBC_SHA "DH-DSS-CAMELLIA128-SHA" @@ -466,6 +620,55 @@ SSL_CTX_callback_ctrl(ssl,SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB,(void (*)(void))cb) #define TLS1_TXT_DHE_RSA_WITH_SEED_SHA "DHE-RSA-SEED-SHA" #define TLS1_TXT_ADH_WITH_SEED_SHA "ADH-SEED-SHA" +/* TLS v1.2 ciphersuites */ +#define TLS1_TXT_RSA_WITH_NULL_SHA256 "NULL-SHA256" +#define TLS1_TXT_RSA_WITH_AES_128_SHA256 "AES128-SHA256" +#define TLS1_TXT_RSA_WITH_AES_256_SHA256 "AES256-SHA256" +#define TLS1_TXT_DH_DSS_WITH_AES_128_SHA256 "DH-DSS-AES128-SHA256" +#define TLS1_TXT_DH_RSA_WITH_AES_128_SHA256 "DH-RSA-AES128-SHA256" +#define TLS1_TXT_DHE_DSS_WITH_AES_128_SHA256 "DHE-DSS-AES128-SHA256" +#define TLS1_TXT_DHE_RSA_WITH_AES_128_SHA256 "DHE-RSA-AES128-SHA256" +#define TLS1_TXT_DH_DSS_WITH_AES_256_SHA256 "DH-DSS-AES256-SHA256" +#define TLS1_TXT_DH_RSA_WITH_AES_256_SHA256 "DH-RSA-AES256-SHA256" +#define TLS1_TXT_DHE_DSS_WITH_AES_256_SHA256 "DHE-DSS-AES256-SHA256" +#define TLS1_TXT_DHE_RSA_WITH_AES_256_SHA256 "DHE-RSA-AES256-SHA256" +#define TLS1_TXT_ADH_WITH_AES_128_SHA256 "ADH-AES128-SHA256" +#define TLS1_TXT_ADH_WITH_AES_256_SHA256 "ADH-AES256-SHA256" + +/* TLS v1.2 GCM ciphersuites from RFC5288 */ +#define TLS1_TXT_RSA_WITH_AES_128_GCM_SHA256 "AES128-GCM-SHA256" +#define TLS1_TXT_RSA_WITH_AES_256_GCM_SHA384 "AES256-GCM-SHA384" +#define TLS1_TXT_DHE_RSA_WITH_AES_128_GCM_SHA256 "DHE-RSA-AES128-GCM-SHA256" +#define TLS1_TXT_DHE_RSA_WITH_AES_256_GCM_SHA384 "DHE-RSA-AES256-GCM-SHA384" +#define TLS1_TXT_DH_RSA_WITH_AES_128_GCM_SHA256 "DH-RSA-AES128-GCM-SHA256" +#define TLS1_TXT_DH_RSA_WITH_AES_256_GCM_SHA384 "DH-RSA-AES256-GCM-SHA384" +#define TLS1_TXT_DHE_DSS_WITH_AES_128_GCM_SHA256 "DHE-DSS-AES128-GCM-SHA256" +#define TLS1_TXT_DHE_DSS_WITH_AES_256_GCM_SHA384 "DHE-DSS-AES256-GCM-SHA384" +#define TLS1_TXT_DH_DSS_WITH_AES_128_GCM_SHA256 "DH-DSS-AES128-GCM-SHA256" +#define TLS1_TXT_DH_DSS_WITH_AES_256_GCM_SHA384 "DH-DSS-AES256-GCM-SHA384" +#define TLS1_TXT_ADH_WITH_AES_128_GCM_SHA256 "ADH-AES128-GCM-SHA256" +#define TLS1_TXT_ADH_WITH_AES_256_GCM_SHA384 "ADH-AES256-GCM-SHA384" + +/* ECDH HMAC based ciphersuites from RFC5289 */ + +#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_128_SHA256 "ECDHE-ECDSA-AES128-SHA256" +#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_256_SHA384 "ECDHE-ECDSA-AES256-SHA384" +#define TLS1_TXT_ECDH_ECDSA_WITH_AES_128_SHA256 "ECDH-ECDSA-AES128-SHA256" +#define TLS1_TXT_ECDH_ECDSA_WITH_AES_256_SHA384 "ECDH-ECDSA-AES256-SHA384" +#define TLS1_TXT_ECDHE_RSA_WITH_AES_128_SHA256 "ECDHE-RSA-AES128-SHA256" +#define TLS1_TXT_ECDHE_RSA_WITH_AES_256_SHA384 "ECDHE-RSA-AES256-SHA384" +#define TLS1_TXT_ECDH_RSA_WITH_AES_128_SHA256 "ECDH-RSA-AES128-SHA256" +#define TLS1_TXT_ECDH_RSA_WITH_AES_256_SHA384 "ECDH-RSA-AES256-SHA384" + +/* ECDH GCM based ciphersuites from RFC5289 */ +#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256 "ECDHE-ECDSA-AES128-GCM-SHA256" +#define TLS1_TXT_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384 "ECDHE-ECDSA-AES256-GCM-SHA384" +#define TLS1_TXT_ECDH_ECDSA_WITH_AES_128_GCM_SHA256 "ECDH-ECDSA-AES128-GCM-SHA256" +#define TLS1_TXT_ECDH_ECDSA_WITH_AES_256_GCM_SHA384 "ECDH-ECDSA-AES256-GCM-SHA384" +#define TLS1_TXT_ECDHE_RSA_WITH_AES_128_GCM_SHA256 "ECDHE-RSA-AES128-GCM-SHA256" +#define TLS1_TXT_ECDHE_RSA_WITH_AES_256_GCM_SHA384 "ECDHE-RSA-AES256-GCM-SHA384" +#define TLS1_TXT_ECDH_RSA_WITH_AES_128_GCM_SHA256 "ECDH-RSA-AES128-GCM-SHA256" +#define TLS1_TXT_ECDH_RSA_WITH_AES_256_GCM_SHA384 "ECDH-RSA-AES256-GCM-SHA384" #define TLS_CT_RSA_SIGN 1 #define TLS_CT_DSS_SIGN 2 -- cgit v1.2.3-55-g6feb