Merge branch 'busybox' into merge

author: Ron Yorston <rmy@pobox.com> 2021-10-13 14:37:51 +0100
committer: Ron Yorston <rmy@pobox.com> 2021-10-13 14:37:51 +0100
commit: 0ecf1aea459571b48dc68ddc2b7b9265740fa960 (patch)
tree: 491d6184a44b8b525a4ca35759d622aecd7f6344 /networking
parent: 4859ddcb20616718efbea12c6bf8b27c469b68de (diff)
parent: aaf3d5ba74c5da97ff80b61f30cb8dd225d39096 (diff)
download: busybox-w32-0ecf1aea459571b48dc68ddc2b7b9265740fa960.tar.gz
busybox-w32-0ecf1aea459571b48dc68ddc2b7b9265740fa960.tar.bz2
busybox-w32-0ecf1aea459571b48dc68ddc2b7b9265740fa960.zip
15 files changed, 1206 insertions, 1093 deletions
diff --git a/networking/Config.src b/networking/Config.src
index 04d644bc9..0942645c3 100644
--- a/networking/Config.src
+++ b/networking/Config.src
@@ -46,6 +46,32 @@ config VERBOSE_RESOLUTION_ERRORS
        "can't resolve 'hostname.com'" and want to know more.
        This may increase size of your executable a bit.
+config FEATURE_ETC_NETWORKS
+        bool "Support /etc/networks"
+        default n
+        help
+        Enable support for network names in /etc/networks. This is
+        a rarely used feature which allows you to use names
+        instead of IP/mask pairs in route command.
+config FEATURE_ETC_SERVICES
+        bool "Consult /etc/services even for well-known ports"
+        default n
+        help
+        Look up e.g. "telnet" and "http" in /etc/services file
+        instead of assuming ports 23 and 80.
+        This is almost never necessary (everybody uses standard ports),
+        and it makes sense to avoid reading this file.
+        If you disable this option, in the cases where port is explicitly
+        specified as a service name (e.g. "telnet HOST PORTNAME"),
+        it will still be looked up in /etc/services.
+config FEATURE_HWIB
+        bool "Support infiniband HW"
+        default y
+        help
+        Support for printing infiniband addresses in network applets.
 config FEATURE_TLS_SHA1
        bool "In TLS code, support ciphers which use deprecated SHA1"
        depends on TLS
diff --git a/networking/brctl.c b/networking/brctl.c
index c83aac6e0..956bd91f3 100644
--- a/networking/brctl.c
+++ b/networking/brctl.c
@@ -318,7 +318,7 @@ static void printf_xstrtou(const char *fmt)
        printf(fmt, xstrtou(filedata, 0));
 }
-static void show_bridge_port(const char *name)
+static NOINLINE void show_bridge_port(const char *name)
 {
        char pathbuf[IFNAMSIZ + sizeof("/brport/forward_delay_timer") + 8];
        char *sfx;
diff --git a/networking/httpd.c b/networking/httpd.c
index 71e3a723f..6cc189272 100644
--- a/networking/httpd.c
+++ b/networking/httpd.c
@@ -281,7 +281,7 @@
 //usage:        IF_NOT_PLATFORM_MINGW32(
 //usage:     "\n        -i              Inetd mode"
 //usage:        )
-//usage:     "\n        -f              Don't daemonize"
+//usage:     "\n        -f              Run in foreground"
 //usage:     "\n        -v[v]           Verbose"
 //usage:     "\n        -p [IP:]PORT    Bind to IP:PORT (default *:"STR(CONFIG_FEATURE_HTTPD_PORT_DEFAULT)")"
 //usage:        IF_FEATURE_HTTPD_SETUID(
@@ -1918,14 +1918,17 @@ static NOINLINE void send_file_and_exit(const char *url, int what)
                send_headers(HTTP_OK);
 #if ENABLE_FEATURE_USE_SENDFILE
        {
-                off_t offset = (range_start < 0) ? 0 : range_start;
+                off_t offset;
+                if (range_start < 0)
+                        range_start = 0;
+                offset = range_start;
                while (1) {
                        /* sz is rounded down to 64k */
                        ssize_t sz = MAXINT(ssize_t) - 0xffff;
                        IF_FEATURE_HTTPD_RANGES(if (sz > range_len) sz = range_len;)
                        count = sendfile(STDOUT_FILENO, fd, &offset, sz);
                        if (count < 0) {
-                                if (offset == range_start)
+                                if (offset == range_start) /* was it the very 1st sendfile? */
                                        break; /* fall back to read/write loop */
                                goto fin;
                        }
diff --git a/networking/ifplugd.c b/networking/ifplugd.c
index 18dcaff96..c4b6b9584 100644
--- a/networking/ifplugd.c
+++ b/networking/ifplugd.c
@@ -20,7 +20,7 @@
 //usage:       "[OPTIONS]"
 //usage:#define ifplugd_full_usage "\n\n"
 //usage:       "Network interface plug detection daemon\n"
-//usage:     "\n        -n              Don't daemonize"
+//usage:     "\n        -n              Run in foreground"
 //usage:     "\n        -s              Don't log to syslog"
 //usage:     "\n        -i IFACE        Interface"
 //usage:     "\n        -f/-F           Treat link detection error as link down/link up"
diff --git a/networking/ip.c b/networking/ip.c
index 85b1ba080..7c3208699 100644
--- a/networking/ip.c
+++ b/networking/ip.c
@@ -152,7 +152,7 @@
 //usage:#define iplink_trivial_usage
 //usage:       /*Usage:iplink*/"set IFACE [up|down] [arp on|off] [multicast on|off]\n"
 //usage:       "        [promisc on|off] [mtu NUM] [name NAME] [qlen NUM] [address MAC]\n"
-//usage:       "        [master IFACE | nomaster]"
+//usage:       "        [master IFACE | nomaster] [netns PID]"
 // * short help shows only "set" command, long help continues (with just one "\n")
 // * and shows all other commands:
 //usage:#define iplink_full_usage "\n"
diff --git a/networking/libiproute/iplink.c b/networking/libiproute/iplink.c
index 1a1064bdc..68d199044 100644
--- a/networking/libiproute/iplink.c
+++ b/networking/libiproute/iplink.c
@@ -153,6 +153,30 @@ static void set_master(char *dev, int master)
 }
 /* Exits on error */
+/* Send an RTM_NEWLINK netlink request for interface 'dev' carrying an
+ * IFLA_NET_NS_PID attribute whose value is 'netns'.
+ * The caller (do_set) passes the number parsed from "netns PID".
+ * Calls xfunc_die() if rtnl_talk() returns a negative value.
+ */
+static void set_netns(char *dev, int netns)
+{
+        struct rtnl_handle rth;
+        /* Request layout: netlink header, ifinfomsg, then attribute space */
+        struct {
+                struct nlmsghdr  n;
+                struct ifinfomsg i;
+                char             buf[1024];
+        } req;
+        memset(&req, 0, sizeof(req));
+        /* Initial length covers only the ifinfomsg payload;
+         * presumably addattr_l() grows nlmsg_len as it appends - confirm
+         */
+        req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+        req.n.nlmsg_flags = NLM_F_REQUEST;
+        req.n.nlmsg_type = RTM_NEWLINK;
+        req.i.ifi_family = preferred_family;
+        xrtnl_open(&rth);
+        req.i.ifi_index = xll_name_to_index(dev);
+        //printf("netns %i for %i\n", netns, req.i.ifi_index);
+        /* sizeof(req) is passed as the size limit for the whole message.
+         * The attribute payload is the first 4 bytes of 'netns'
+         * (NOTE(review): hard-coded 4 assumes sizeof(int) == 4)
+         */
+        addattr_l(&req.n, sizeof(req), IFLA_NET_NS_PID, &netns, 4);
+        if (rtnl_talk(&rth, &req.n, 0, 0, NULL, NULL, NULL) < 0)
+                xfunc_die();
+}
+/* Exits on error */
 static int get_address(char *dev, int *htype)
 {
        struct ifreq ifr;
@@ -226,6 +250,7 @@ static int do_set(char **argv)
        int qlen = -1;
        int mtu = -1;
        int master = -1;
+        int netns = -1;
        char *newaddr = NULL;
        char *newbrd = NULL;
        struct ifreq ifr0, ifr1;
@@ -234,11 +259,11 @@ static int do_set(char **argv)
        /* If you add stuff here, update iplink_full_usage */
        static const char keywords[] ALIGN1 =
                "up\0""down\0""name\0""mtu\0""qlen\0""multicast\0"
-                "arp\0""promisc\0""address\0"
+                "arp\0""promisc\0""address\0""netns\0"
                "master\0""nomaster\0"
                "dev\0" /* must be last */;
        enum { ARG_up = 0, ARG_down, ARG_name, ARG_mtu, ARG_qlen, ARG_multicast,
-                ARG_arp, ARG_promisc, ARG_addr,
+                ARG_arp, ARG_promisc, ARG_addr, ARG_netns,
                ARG_master, ARG_nomaster,
                ARG_dev };
        enum { PARM_on = 0, PARM_off };
@@ -276,6 +301,9 @@ static int do_set(char **argv)
                        master = xll_name_to_index(*argv);
                } else if (key == ARG_nomaster) {
                        master = 0;
+                } else if (key == ARG_netns) {
+                        NEXT_ARG();
+                        netns = get_unsigned(*argv, "netns");
                } else if (key >= ARG_dev) {
                        /* ^^^^^^ ">=" here results in "dev IFACE" treated as default */
                        if (key == ARG_dev) {
@@ -463,6 +491,9 @@ static int do_set(char **argv)
        if (master != -1) {
                set_master(dev, master);
        }
+        if (netns != -1) {
+                set_netns(dev, netns);
+        }
        if (mask)
                do_chflags(dev, flags, mask);
        return 0;
diff --git a/networking/nslookup.c b/networking/nslookup.c
index de7b5c0e7..6da97baf4 100644
--- a/networking/nslookup.c
+++ b/networking/nslookup.c
@@ -335,7 +335,7 @@ enum {
        OPT_debug = (1 << 0),
 };
-static int parse_reply(const unsigned char *msg, size_t len)
+static NOINLINE int parse_reply(const unsigned char *msg, size_t len)
 {
        HEADER *header;
diff --git a/networking/ntpd.c b/networking/ntpd.c
index 6bf6c4e07..204e1d7c2 100644
--- a/networking/ntpd.c
+++ b/networking/ntpd.c
@@ -78,7 +78,7 @@
 //usage:#define ntpd_full_usage "\n\n"
 //usage:       "NTP client/server\n"
 //usage:     "\n        -d[d]   Verbose"
-//usage:     "\n        -n      Do not daemonize"
+//usage:     "\n        -n      Run in foreground"
 //usage:     "\n        -q      Quit after clock is set"
 //usage:     "\n        -N      Run at high priority"
 //usage:     "\n        -w      Do not set time (only query peers), implies -n"
@@ -1152,7 +1152,7 @@ fit(peer_t *p, double rd)
 //              return 0;
        return 1;
 }
-static peer_t*
+static NOINLINE peer_t*
 select_and_cluster(void)
 {
        peer_t     *p;
diff --git a/networking/tls.c b/networking/tls.c
index a1b12f9ed..36f83212b 100644
--- a/networking/tls.c
+++ b/networking/tls.c
@@ -22,47 +22,23 @@
 #include "tls.h"
+// Usually enabled. You can disable some of them to force only
+// specific ciphers to be advertized to server.
+// (this would not exclude code to handle disabled ciphers, no code size win)
+#define ALLOW_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256       1
+#define ALLOW_ECDHE_RSA_WITH_AES_128_CBC_SHA256         1
+#define ALLOW_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256       1
+#define ALLOW_ECDHE_RSA_WITH_AES_128_GCM_SHA256         1
+#define ALLOW_RSA_WITH_AES_128_CBC_SHA256       1
+#define ALLOW_RSA_WITH_AES_256_CBC_SHA256       1
+#define ALLOW_RSA_WITH_AES_128_GCM_SHA256       1
+#define ALLOW_CURVE_P256        1
+#define ALLOW_CURVE_X25519      1
+// For testing (does everything except encrypting).
 // works against "openssl s_server -cipher NULL"
 // and against wolfssl-3.9.10-stable/examples/server/server.c:
-#define ALLOW_RSA_NULL_SHA256  0  // for testing (does everything except encrypting)
+#define ALLOW_RSA_NULL_SHA256                   0
-//Tested against kernel.org:
-//#define CIPHER_ID TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA // ok, recvs SERVER_KEY_EXCHANGE *** matrixssl uses this on my box
-//#define CIPHER_ID TLS_RSA_WITH_AES_256_CBC_SHA256 // ok, no SERVER_KEY_EXCHANGE
-//#define CIPHER_ID TLS_DH_anon_WITH_AES_256_CBC_SHA // SSL_ALERT_HANDSHAKE_FAILURE
-//^^^^^^^^^^^^^^^^^^^^^^^ (tested b/c this one doesn't req server certs... no luck, server refuses it)
-//#define CIPHER_ID TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384 // SSL_ALERT_HANDSHAKE_FAILURE
-//#define CIPHER_ID TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256 // SSL_ALERT_HANDSHAKE_FAILURE
-//#define CIPHER_ID TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 // ok, recvs SERVER_KEY_EXCHANGE
-//#define CIPHER_ID TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256
-//#define CIPHER_ID TLS_ECDH_ECDSA_WITH_AES_256_GCM_SHA384
-//#define CIPHER_ID TLS_ECDH_ECDSA_WITH_AES_128_GCM_SHA256 // SSL_ALERT_HANDSHAKE_FAILURE
-//#define CIPHER_ID TLS_ECDH_RSA_WITH_AES_256_GCM_SHA384
-//#define CIPHER_ID TLS_ECDH_RSA_WITH_AES_128_GCM_SHA256 // SSL_ALERT_HANDSHAKE_FAILURE
-//#define CIPHER_ID TLS_RSA_WITH_AES_256_GCM_SHA384 // ok, no SERVER_KEY_EXCHANGE
-//#define CIPHER_ID TLS_RSA_WITH_AES_128_GCM_SHA256 // ok, no SERVER_KEY_EXCHANGE
-// works against wolfssl-3.9.10-stable/examples/server/server.c
-// works for kernel.org
-// does not work for cdn.kernel.org (e.g. downloading an actual tarball, not a web page)
-//  getting alert 40 "handshake failure" at once
-//  with GNU Wget 1.18, they agree on TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 (0xC02F) cipher
-//  fail: openssl s_client -connect cdn.kernel.org:443 -debug -tls1_2 -cipher AES256-SHA256
-//  fail: openssl s_client -connect cdn.kernel.org:443 -debug -tls1_2 -cipher AES256-GCM-SHA384
-//  fail: openssl s_client -connect cdn.kernel.org:443 -debug -tls1_2 -cipher AES128-SHA256
-//  ok:   openssl s_client -connect cdn.kernel.org:443 -debug -tls1_2 -cipher AES128-GCM-SHA256
-//  ok:   openssl s_client -connect cdn.kernel.org:443 -debug -tls1_2 -cipher AES128-SHA
-//        (TLS_RSA_WITH_AES_128_CBC_SHA - in TLS 1.2 it's mandated to be always supported)
-//#define CIPHER_ID1  TLS_RSA_WITH_AES_256_CBC_SHA256 //0x003D
-// Works with "wget https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.9.5.tar.xz"
-//#define CIPHER_ID2  TLS_RSA_WITH_AES_128_CBC_SHA    //0x002F
-// bug #11456:
-// ftp.openbsd.org only supports ECDHE-RSA-AESnnn-GCM-SHAnnn or ECDHE-RSA-CHACHA20-POLY1305
-//#define CIPHER_ID3  TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 //0xC02F
-// host is.gd accepts only ECDHE-ECDSA-foo (the simplest which works: ECDHE-ECDSA-AES128-SHA 0xC009)
-//#define CIPHER_ID4  TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA  //0xC009
 #define TLS_DEBUG      0
 #define TLS_DEBUG_HASH 0
@@ -1488,9 +1464,20 @@ static ALWAYS_INLINE void fill_handshake_record_hdr(void *buf, unsigned type, un
 static void send_client_hello_and_alloc_hsd(tls_state_t *tls, const char *sni)
 {
-#define NUM_CIPHERS (7 + 6 * ENABLE_FEATURE_TLS_SHA1 + ALLOW_RSA_NULL_SHA256)
+#define NUM_CIPHERS (0 \
+        + 4 * ENABLE_FEATURE_TLS_SHA1 \
+        + ALLOW_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256 \
+        + ALLOW_ECDHE_RSA_WITH_AES_128_CBC_SHA256 \
+        + ALLOW_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256 \
+        + ALLOW_ECDHE_RSA_WITH_AES_128_GCM_SHA256 \
+        + 2 * ENABLE_FEATURE_TLS_SHA1 \
+        + ALLOW_RSA_WITH_AES_128_CBC_SHA256 \
+        + ALLOW_RSA_WITH_AES_256_CBC_SHA256 \
+        + ALLOW_RSA_WITH_AES_128_GCM_SHA256 \
+        + ALLOW_RSA_NULL_SHA256 \
+        )
        static const uint8_t ciphers[] = {
-                0x00,2 + NUM_CIPHERS*2, //len16_be
+                0x00,2 * (1 + NUM_CIPHERS), //len16_be
                0x00,0xFF, //not a cipher - TLS_EMPTY_RENEGOTIATION_INFO_SCSV
                /* ^^^^^^ RFC 5746 Renegotiation Indication Extension - some servers will refuse to work with us otherwise */
 #if ENABLE_FEATURE_TLS_SHA1
@@ -1501,14 +1488,22 @@ static void send_client_hello_and_alloc_hsd(tls_state_t *tls, const char *sni)
        //      0xC0,0x18, //   TLS_ECDH_anon_WITH_AES_128_CBC_SHA
        //      0xC0,0x19, //   TLS_ECDH_anon_WITH_AES_256_CBC_SHA
 #endif
+#if ALLOW_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256
                0xC0,0x23, // 5 TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256 - ok: wget https://is.gd/
+#endif
        //      0xC0,0x24, //   TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384 - can't do SHA384 yet
+#if ALLOW_ECDHE_RSA_WITH_AES_128_CBC_SHA256
                0xC0,0x27, // 6 TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 - ok: openssl s_server ... -cipher ECDHE-RSA-AES128-SHA256
+#endif
        //      0xC0,0x28, //   TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384 - can't do SHA384 yet
+#if ALLOW_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256
                0xC0,0x2B, // 7 TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256 - ok: wget https://is.gd/
+#endif
        //      0xC0,0x2C, //   TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384 - wget https://is.gd/: "TLS error from peer (alert code 20): bad MAC"
 //TODO: GCM_SHA384 ciphers can be supported, only need sha384-based PRF?
+#if ALLOW_ECDHE_RSA_WITH_AES_128_GCM_SHA256
                0xC0,0x2F, // 8 TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 - ok: openssl s_server ... -cipher ECDHE-RSA-AES128-GCM-SHA256
+#endif
        //      0xC0,0x30, //   TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 - openssl s_server ... -cipher ECDHE-RSA-AES256-GCM-SHA384: "decryption failed or bad record mac"
        //possibly these too:
 #if ENABLE_FEATURE_TLS_SHA1
@@ -1521,9 +1516,15 @@ static void send_client_hello_and_alloc_hsd(tls_state_t *tls, const char *sni)
                0x00,0x2F, // 9 TLS_RSA_WITH_AES_128_CBC_SHA - ok: openssl s_server ... -cipher AES128-SHA
                0x00,0x35, //10 TLS_RSA_WITH_AES_256_CBC_SHA - ok: openssl s_server ... -cipher AES256-SHA
 #endif
+#if ALLOW_RSA_WITH_AES_128_CBC_SHA256
                0x00,0x3C, //11 TLS_RSA_WITH_AES_128_CBC_SHA256 - ok: openssl s_server ... -cipher AES128-SHA256
+#endif
+#if ALLOW_RSA_WITH_AES_256_CBC_SHA256
                0x00,0x3D, //12 TLS_RSA_WITH_AES_256_CBC_SHA256 - ok: openssl s_server ... -cipher AES256-SHA256
+#endif
+#if ALLOW_RSA_WITH_AES_128_GCM_SHA256
                0x00,0x9C, //13 TLS_RSA_WITH_AES_128_GCM_SHA256 - ok: openssl s_server ... -cipher AES128-GCM-SHA256
+#endif
        //      0x00,0x9D, //   TLS_RSA_WITH_AES_256_GCM_SHA384 - openssl s_server ... -cipher AES256-GCM-SHA384: "decryption failed or bad record mac"
 #if ALLOW_RSA_NULL_SHA256
                0x00,0x3B, //   TLS_RSA_WITH_NULL_SHA256
@@ -1532,12 +1533,16 @@ static void send_client_hello_and_alloc_hsd(tls_state_t *tls, const char *sni)
        };
        static const uint8_t supported_groups[] = {
                0x00,0x0a, //extension_type: "supported_groups"
-                0x00,0x06, //ext len
+                0x00,2 * (1 + ALLOW_CURVE_P256 + ALLOW_CURVE_X25519), //ext len
-                0x00,0x04, //list len
+                0x00,2 * (0 + ALLOW_CURVE_P256 + ALLOW_CURVE_X25519), //list len
-                0x00,0x17, //curve_secp256r1 (aka P256)
+#if ALLOW_CURVE_P256
+                0x00,0x17, //curve_secp256r1 (aka P256, aka prime256v1)
+#endif
                //0x00,0x18, //curve_secp384r1
                //0x00,0x19, //curve_secp521r1
+#if ALLOW_CURVE_X25519
                0x00,0x1d, //curve_x25519 (RFC 7748)
+#endif
                //0x00,0x1e, //curve_x448 (RFC 7748)
        };
        //static const uint8_t signature_algorithms[] = {
@@ -1555,7 +1560,7 @@ static void send_client_hello_and_alloc_hsd(tls_state_t *tls, const char *sni)
                uint8_t session_id_len;
                /* uint8_t session_id[]; */
                uint8_t cipherid_len16_hi, cipherid_len16_lo;
-                uint8_t cipherid[2 + NUM_CIPHERS*2]; /* actually variable */
+                uint8_t cipherid[2 * (1 + NUM_CIPHERS)]; /* actually variable */
                uint8_t comprtypes_len;
                uint8_t comprtypes[1]; /* actually variable */
                /* Extensions (SNI shown):
@@ -1603,7 +1608,7 @@ static void send_client_hello_and_alloc_hsd(tls_state_t *tls, const char *sni)
                memset(record->rand32, 0x11, sizeof(record->rand32));
        /* record->session_id_len = 0; - already is */
-        BUILD_BUG_ON(sizeof(ciphers) != 2 + 2 + NUM_CIPHERS*2 + 2);
+        BUILD_BUG_ON(sizeof(ciphers) != 2 * (1 + 1 + NUM_CIPHERS + 1));
        memcpy(&record->cipherid_len16_hi, ciphers, sizeof(ciphers));
        ptr = (void*)(record + 1);
@@ -1700,42 +1705,33 @@ static void get_server_hello(tls_state_t *tls)
        /* Set up encryption params based on selected cipher */
 #if 0
-#if ENABLE_FEATURE_TLS_SHA1
                0xC0,0x09, // 1 TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA - ok: wget https://is.gd/
                0xC0,0x0A, // 2 TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA - ok: wget https://is.gd/
                0xC0,0x13, // 3 TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA - ok: openssl s_server ... -cipher ECDHE-RSA-AES128-SHA
                0xC0,0x14, // 4 TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA - ok: openssl s_server ... -cipher ECDHE-RSA-AES256-SHA (might fail with older openssl)
        //      0xC0,0x18, //   TLS_ECDH_anon_WITH_AES_128_CBC_SHA
        //      0xC0,0x19, //   TLS_ECDH_anon_WITH_AES_256_CBC_SHA
-#endif
                0xC0,0x23, // 5 TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256 - ok: wget https://is.gd/
        //      0xC0,0x24, //   TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384 - can't do SHA384 yet
                0xC0,0x27, // 6 TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 - ok: openssl s_server ... -cipher ECDHE-RSA-AES128-SHA256
        //      0xC0,0x28, //   TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384 - can't do SHA384 yet
                0xC0,0x2B, // 7 TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256 - ok: wget https://is.gd/
        //      0xC0,0x2C, //   TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384 - wget https://is.gd/: "TLS error from peer (alert code 20): bad MAC"
-//TODO: GCM_SHA384 ciphers can be supported, only need sha384-based PRF?
                0xC0,0x2F, // 8 TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256 - ok: openssl s_server ... -cipher ECDHE-RSA-AES128-GCM-SHA256
        //      0xC0,0x30, //   TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 - openssl s_server ... -cipher ECDHE-RSA-AES256-GCM-SHA384: "decryption failed or bad record mac"
        //possibly these too:
-#if ENABLE_FEATURE_TLS_SHA1
        //      0xC0,0x35, //   TLS_ECDHE_PSK_WITH_AES_128_CBC_SHA
        //      0xC0,0x36, //   TLS_ECDHE_PSK_WITH_AES_256_CBC_SHA
-#endif
        //      0xC0,0x37, //   TLS_ECDHE_PSK_WITH_AES_128_CBC_SHA256
        //      0xC0,0x38, //   TLS_ECDHE_PSK_WITH_AES_256_CBC_SHA384 - can't do SHA384 yet
-#if ENABLE_FEATURE_TLS_SHA1
                0x00,0x2F, // 9 TLS_RSA_WITH_AES_128_CBC_SHA - ok: openssl s_server ... -cipher AES128-SHA
                0x00,0x35, //10 TLS_RSA_WITH_AES_256_CBC_SHA - ok: openssl s_server ... -cipher AES256-SHA
-#endif
                0x00,0x3C, //11 TLS_RSA_WITH_AES_128_CBC_SHA256 - ok: openssl s_server ... -cipher AES128-SHA256
                0x00,0x3D, //12 TLS_RSA_WITH_AES_256_CBC_SHA256 - ok: openssl s_server ... -cipher AES256-SHA256
                0x00,0x9C, //13 TLS_RSA_WITH_AES_128_GCM_SHA256 - ok: openssl s_server ... -cipher AES128-GCM-SHA256
        //      0x00,0x9D, //   TLS_RSA_WITH_AES_256_GCM_SHA384 - openssl s_server ... -cipher AES256-GCM-SHA384: "decryption failed or bad record mac"
-#if ALLOW_RSA_NULL_SHA256
                0x00,0x3B, //   TLS_RSA_WITH_NULL_SHA256
 #endif
-#endif
        cipherid1 = cipherid[1];
        tls->cipher_id = 0x100 * cipherid[0] + cipherid1;
        tls->key_size = AES256_KEYSIZE;
@@ -1944,7 +1940,7 @@ static void send_client_key_exchange(tls_state_t *tls)
        if (!(tls->flags & NEED_EC_KEY)) {
                /* RSA */
                if (!(tls->flags & GOT_CERT_RSA_KEY_ALG))
-                        bb_simple_error_msg("server cert is not RSA");
+                        bb_simple_error_msg_and_die("server cert is not RSA");
                tls_get_random(premaster, RSA_PREMASTER_SIZE);
                if (TLS_DEBUG_FIXED_SECRETS)
@@ -2330,6 +2326,47 @@ void FAST_FUNC tls_run_copy_loop(tls_state_t *tls, unsigned flags)
        const int INBUF_STEP = 4 * 1024;
        struct pollfd pfds[2];
+#if 0
+// Debug aid for comparing P256 implementations.
+// Enable this, set SP_DEBUG and FIXED_SECRET to 1,
+// and add
+//      tls_run_copy_loop(NULL, 0);
+// e.g. at the very beginning of wget_main()
+//
+{
+        uint8_t ecc_pub_key32[2 * 32];
+        uint8_t pubkey2x32[2 * 32];
+        uint8_t premaster32[32];
+//Fixed input key:
+//      memset(ecc_pub_key32, 0xee, sizeof(ecc_pub_key32));
+//Fixed 000000000000000000000000000000000000ab000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
+//      memset(ecc_pub_key32, 0x00, sizeof(ecc_pub_key32));
+//      ecc_pub_key32[18] = 0xab;
+//Random key:
+//      tls_get_random(ecc_pub_key32, sizeof(ecc_pub_key32));
+//Biased random (almost all zeros or almost all ones):
+        srand(time(NULL) ^ getpid());
+        if (rand() & 1)
+                memset(ecc_pub_key32, 0x00, sizeof(ecc_pub_key32));
+        else
+                memset(ecc_pub_key32, 0xff, sizeof(ecc_pub_key32));
+        ecc_pub_key32[rand() & 0x3f] = rand();
+        xmove_fd(xopen("p256.OLD", O_WRONLY | O_CREAT | O_TRUNC), 2);
+        curve_P256_compute_pubkey_and_premaster(
+                        pubkey2x32, premaster32,
+                        /*point:*/ ecc_pub_key32
+        );
+        xmove_fd(xopen("p256.NEW", O_WRONLY | O_CREAT | O_TRUNC), 2);
+        curve_P256_compute_pubkey_and_premaster_NEW(
+                        pubkey2x32, premaster32,
+                        /*point:*/ ecc_pub_key32
+        );
+        exit(1);
+}
+#endif
        pfds[0].fd = STDIN_FILENO;
        pfds[0].events = POLLIN;
        pfds[1].fd = tls->ifd;
diff --git a/networking/tls.h b/networking/tls.h
index 215e92b02..0173b87b2 100644
--- a/networking/tls.h
+++ b/networking/tls.h
@@ -101,7 +101,6 @@ void xorbuf_aligned_AES_BLOCK_SIZE(void* buf, const void* mask) FAST_FUNC;
 #include "tls_pstm.h"
-#include "tls_symmetric.h"
 #include "tls_aes.h"
 #include "tls_aesgcm.h"
 #include "tls_rsa.h"
@@ -117,3 +116,7 @@ void curve_x25519_compute_pubkey_and_premaster(
 void curve_P256_compute_pubkey_and_premaster(
                uint8_t *pubkey2x32, uint8_t *premaster32,
                const uint8_t *peerkey2x32) FAST_FUNC;
+void curve_P256_compute_pubkey_and_premaster_NEW(
+                uint8_t *pubkey2x32, uint8_t *premaster32,
+                const uint8_t *peerkey2x32) FAST_FUNC;
diff --git a/networking/tls_fe.c b/networking/tls_fe.c
index ecb410281..3a0a6776f 100644
--- a/networking/tls_fe.c
+++ b/networking/tls_fe.c
@@ -63,16 +63,22 @@ static void fprime_select(byte *dst, const byte *zero, const byte *one, byte con
 }
 #endif
+/* fe_select(dst, src, condition): replace dst with src when 'condition'
+ * is set, otherwise leave dst unchanged.
+ * The "#if 0" variant does this without branching: mask = -condition is
+ * all-ones for condition == 1 and zero for condition == 0, so it expects
+ * 'condition' to be exactly 0 or 1.
+ * The active "#else" macro instead branches on 'condition' and calls
+ * lm_copy() (defined elsewhere in this file; presumably a field-element
+ * copy - confirm), so it is not the constant-time form.
+ */
+#if 0 /* constant-time */
 static void fe_select(byte *dst,
-                const byte *zero, const byte *one,
+                const byte *src,
                byte condition)
 {
        const byte mask = -condition;
        int i;
        for (i = 0; i < F25519_SIZE; i++)
-                dst[i] = zero[i] ^ (mask & (one[i] ^ zero[i]));
+                dst[i] = dst[i] ^ (mask & (src[i] ^ dst[i]));
 }
+#else
+# define fe_select(dst, src, condition) do { \
+        if (condition) lm_copy(dst, src); \
+} while (0)
+#endif
 #if 0 //UNUSED
 static void raw_add(byte *x, const byte *p)
@@ -225,7 +231,7 @@ static void fe_normalize(byte *x)
        minusp[31] = (byte)c;
        /* Load x-p if no underflow */
-        fe_select(x, minusp, x, (c >> 15) & 1);
+        fe_select(x, minusp, !(c & (1<<15)));
 }
 static void lm_add(byte* r, const byte* a, const byte* b)
@@ -548,26 +554,32 @@ static void curve25519(byte *result, const byte *e, const byte *q)
 {
        int i;
-        struct {
+        struct Z {
                /* for bbox's special case of q == NULL meaning "use basepoint" */
                /*static const*/ uint8_t basepoint9[CURVE25519_KEYSIZE]; // = {9};
                /* from wolfssl-3.15.3/wolfssl/wolfcrypt/fe_operations.h */
                /*static const*/ byte f25519_one[F25519_SIZE]; // = {1};
-                /* Current point: P_m */
-                byte xm[F25519_SIZE];
-                byte zm[F25519_SIZE]; // = {1};
                /* Predecessor: P_(m-1) */
                byte xm1[F25519_SIZE]; // = {1};
                byte zm1[F25519_SIZE]; // = {0};
+                /* Current point: P_m */
+                byte xm[F25519_SIZE];
+                byte zm[F25519_SIZE]; // = {1};
+                /* Temporaries */
+                byte xms[F25519_SIZE];
+                byte zms[F25519_SIZE];
        } z;
+        uint8_t *XM1 = (uint8_t*)&z + offsetof(struct Z,xm1); // gcc 11.0.0 workaround
 #define basepoint9 z.basepoint9
 #define f25519_one z.f25519_one
-#define xm         z.xm
-#define zm         z.zm
 #define xm1        z.xm1
 #define zm1        z.zm1
+#define xm         z.xm
+#define zm         z.zm
+#define xms        z.xms
+#define zms        z.zms
        memset(&z, 0, sizeof(z));
        f25519_one[0] = 1;
        zm[0] = 1;
@@ -583,8 +595,8 @@ static void curve25519(byte *result, const byte *e, const byte *q)
        for (i = 253; i >= 0; i--) {
                const int bit = (e[i >> 3] >> (i & 7)) & 1;
-                byte xms[F25519_SIZE];
+//              byte xms[F25519_SIZE];
-                byte zms[F25519_SIZE];
+//              byte zms[F25519_SIZE];
                /* From P_m and P_(m-1), compute P_(2m) and P_(2m-1) */
                xc_diffadd(xm1, zm1, q, f25519_one, xm, zm, xm1, zm1);
@@ -597,10 +609,22 @@ static void curve25519(byte *result, const byte *e, const byte *q)
                 *   bit = 1 --> (P_(2m+1), P_(2m))
                 *   bit = 0 --> (P_(2m), P_(2m-1))
                 */
-                fe_select(xm1, xm1, xm, bit);
+#if 0
-                fe_select(zm1, zm1, zm, bit);
+                fe_select(xm1, xm, bit);
-                fe_select(xm, xm, xms, bit);
+                fe_select(zm1, zm, bit);
-                fe_select(zm, zm, zms, bit);
+                fe_select(xm, xms, bit);
+                fe_select(zm, zms, bit);
+#else
+// same as above in about 50 bytes smaller code, but
+// requires that in-memory order is exactly xm1,zm1,xm,zm,xms,zms
+                if (bit) {
+                        //memcpy(xm1, xm, 4 * F25519_SIZE);
+                        //^^^ gcc 11.0.0 warns of overlapping memcpy
+                        //memmove(xm1, xm, 4 * F25519_SIZE);
+                        //^^^ gcc 11.0.0 warns of out-of-bounds access to xm1[]
+                        memmove(XM1, XM1 + 2 * F25519_SIZE, 4 * F25519_SIZE);
+                }
+#endif
        }
        /* Freeze out of projective coordinates */
diff --git a/networking/tls_pstm.h b/networking/tls_pstm.h
index bc7a0119a..56c6bb879 100644
--- a/networking/tls_pstm.h
+++ b/networking/tls_pstm.h
@@ -283,4 +283,3 @@ extern int32 pstm_invmod(psPool_t *pool, pstm_int * a, pstm_int * b,
        typedef int32 pstm_int;
 #endif /* !DISABLE_PSTM */
 #endif /* _h_PSTMATH */
diff --git a/networking/tls_sp_c32.c b/networking/tls_sp_c32.c
index 5a84852a5..4d4ecdd74 100644
--- a/networking/tls_sp_c32.c
+++ b/networking/tls_sp_c32.c
@@ -9,6 +9,8 @@
 #define FIXED_SECRET      0
 #define FIXED_PEER_PUBKEY 0
+#define ALLOW_ASM         1
 #if SP_DEBUG
 # define dbg(...) fprintf(stderr, __VA_ARGS__)
 static void dump_hex(const char *fmt, const void *vp, int len)
@@ -24,127 +26,98 @@ static void dump_hex(const char *fmt, const void *vp, int len)
 # define dump_hex(...) ((void)0)
 #endif
-#undef DIGIT_BIT
+typedef uint32_t sp_digit;
-#define DIGIT_BIT  32
+typedef int32_t signed_sp_digit;
-typedef int32_t sp_digit;
 /* The code below is taken from parts of
 *  wolfssl-3.15.3/wolfcrypt/src/sp_c32.c
 * and heavily modified.
- * Header comment is kept intact:
 */
-/* sp.c
- *
- * Copyright (C) 2006-2018 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
-/* Implementation by Sean Parkinson. */
 typedef struct sp_point {
-        sp_digit x[2 * 10];
+        sp_digit x[2 * 8];
-        sp_digit y[2 * 10];
+        sp_digit y[2 * 8];
-        sp_digit z[2 * 10];
+        sp_digit z[2 * 8];
        int infinity;
 } sp_point;
 /* The modulus (prime) of the curve P256. */
-static const sp_digit p256_mod[10] = {
+static const sp_digit p256_mod[8] = {
-        0x3ffffff,0x3ffffff,0x3ffffff,0x003ffff,0x0000000,
+        0xffffffff,0xffffffff,0xffffffff,0x00000000,
-        0x0000000,0x0000000,0x0000400,0x3ff0000,0x03fffff,
+        0x00000000,0x00000000,0x00000001,0xffffffff,
 };
 #define p256_mp_mod ((sp_digit)0x000001)
-/* Write r as big endian to byte aray.
+/* Normalize the values in each word to 32 bits - NOP */
+#define sp_256_norm_8(a) ((void)0)
+/* Write r as big endian to byte array.
 * Fixed length number of bytes written: 32
 *
 * r  A single precision integer.
 * a  Byte array.
 */
-static void sp_256_to_bin(sp_digit* r, uint8_t* a)
+static void sp_256_to_bin_8(const sp_digit* r, uint8_t* a)
 {
-        int i, j, s = 0, b;
+        int i;
-        for (i = 0; i < 9; i++) {
+        sp_256_norm_8(r);
-                r[i+1] += r[i] >> 26;
-                r[i] &= 0x3ffffff;
+        r += 8;
-        }
+        for (i = 0; i < 8; i++) {
-        j = 256 / 8 - 1;
+                r--;
-        a[j] = 0;
+                move_to_unaligned32(a, SWAP_BE32(*r));
-        for (i = 0; i < 10 && j >= 0; i++) {
+                a += 4;
-                b = 0;
-                a[j--] |= r[i] << s; b += 8 - s;
-                if (j < 0)
-                        break;
-                while (b < 26) {
-                        a[j--] = r[i] >> b; b += 8;
-                        if (j < 0)
-                                break;
-                }
-                s = 8 - (b - 26);
-                if (j >= 0)
-                        a[j] = 0;
-                if (s != 0)
-                        j++;
        }
 }
-/* Read big endian unsigned byte aray into r.
+/* Read big endian unsigned byte array into r.
 *
 * r  A single precision integer.
 * a  Byte array.
 * n  Number of bytes in array to read.
 */
-static void sp_256_from_bin(sp_digit* r, int max, const uint8_t* a, int n)
+static void sp_256_from_bin_8(sp_digit* r, const uint8_t* a)
 {
-        int i, j = 0, s = 0;
+        int i;
-        r[0] = 0;
+        r += 8;
-        for (i = n-1; i >= 0; i--) {
+        for (i = 0; i < 8; i++) {
-                r[j] |= ((sp_digit)a[i]) << s;
+                sp_digit v;
-                if (s >= 18) {
+                move_from_unaligned32(v, a);
-                        r[j] &= 0x3ffffff;
+                *--r = SWAP_BE32(v);
-                        s = 26 - s;
+                a += 4;
-                        if (j + 1 >= max)
-                                break;
-                        r[++j] = a[i] >> s;
-                        s = 8 - s;
-                }
-                else
-                        s += 8;
        }
+}
-        for (j++; j < max; j++)
+#if SP_DEBUG
-                r[j] = 0;
+static void dump_256(const char *fmt, const sp_digit* r)
+{
+        uint8_t b32[32];
+        sp_256_to_bin_8(r, b32);
+        dump_hex(fmt, b32, 32);
+}
+static void dump_512(const char *fmt, const sp_digit* r)
+{
+        uint8_t b64[64];
+        sp_256_to_bin_8(r, b64 + 32);
+        sp_256_to_bin_8(r+8, b64);
+        dump_hex(fmt, b64, 64);
 }
+#else
+# define dump_256(...) ((void)0)
+# define dump_512(...) ((void)0)
+#endif
 /* Convert a point of big-endian 32-byte x,y pair to type sp_point. */
 static void sp_256_point_from_bin2x32(sp_point* p, const uint8_t *bin2x32)
 {
        memset(p, 0, sizeof(*p));
        /*p->infinity = 0;*/
-        sp_256_from_bin(p->x, 2 * 10, bin2x32, 32);
+        sp_256_from_bin_8(p->x, bin2x32);
-        sp_256_from_bin(p->y, 2 * 10, bin2x32 + 32, 32);
+        sp_256_from_bin_8(p->y, bin2x32 + 32);
-        //static const uint8_t one[1] = { 1 };
+        p->z[0] = 1; /* p->z = 1 */
-        //sp_256_from_bin(p->z, 2 * 10, one, 1);
-        p->z[0] = 1;
 }
 /* Compare a with b.
@@ -152,201 +125,650 @@ static void sp_256_point_from_bin2x32(sp_point* p, const uint8_t *bin2x32)
 * return -ve, 0 or +ve if a is less than, equal to or greater than b
 * respectively.
 */
-static sp_digit sp_256_cmp_10(const sp_digit* a, const sp_digit* b)
+static signed_sp_digit sp_256_cmp_8(const sp_digit* a, const sp_digit* b)
 {
-        sp_digit r;
        int i;
-        for (i = 9; i >= 0; i--) {
+        for (i = 7; i >= 0; i--) {
-                r = a[i] - b[i];
+/*              signed_sp_digit r = a[i] - b[i];
-                if (r != 0)
+ *              if (r != 0)
-                        break;
+ *                      return r;
+ * does not work: think about a[i]=0, b[i]=0xffffffff
+ */
+                if (a[i] == b[i])
+                        continue;
+                return (a[i] > b[i]) * 2 - 1;
        }
-        return r;
+        return 0;
 }
 /* Compare two numbers to determine if they are equal.
 *
 * return 1 when equal and 0 otherwise.
 */
-static int sp_256_cmp_equal_10(const sp_digit* a, const sp_digit* b)
+static int sp_256_cmp_equal_8(const sp_digit* a, const sp_digit* b)
 {
-        return sp_256_cmp_10(a, b) == 0;
+        return sp_256_cmp_8(a, b) == 0;
 }
-/* Normalize the values in each word to 26 bits. */
+/* Add b to a into r. (r = a + b). Return !0 on overflow */
-static void sp_256_norm_10(sp_digit* a)
+static int sp_256_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 {
+#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
+        sp_digit reg;
+        asm volatile (
+"\n             movl    (%0), %3"
+"\n             addl    (%1), %3"
+"\n             movl    %3, (%2)"
+"\n"
+"\n             movl    1*4(%0), %3"
+"\n             adcl    1*4(%1), %3"
+"\n             movl    %3, 1*4(%2)"
+"\n"
+"\n             movl    2*4(%0), %3"
+"\n             adcl    2*4(%1), %3"
+"\n             movl    %3, 2*4(%2)"
+"\n"
+"\n             movl    3*4(%0), %3"
+"\n             adcl    3*4(%1), %3"
+"\n             movl    %3, 3*4(%2)"
+"\n"
+"\n             movl    4*4(%0), %3"
+"\n             adcl    4*4(%1), %3"
+"\n             movl    %3, 4*4(%2)"
+"\n"
+"\n             movl    5*4(%0), %3"
+"\n             adcl    5*4(%1), %3"
+"\n             movl    %3, 5*4(%2)"
+"\n"
+"\n             movl    6*4(%0), %3"
+"\n             adcl    6*4(%1), %3"
+"\n             movl    %3, 6*4(%2)"
+"\n"
+"\n             movl    7*4(%0), %3"
+"\n             adcl    7*4(%1), %3"
+"\n             movl    %3, 7*4(%2)"
+"\n"
+"\n             sbbl    %3, %3"
+"\n"
+                : "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
+                : "0" (a), "1" (b), "2" (r)
+                : "memory"
+        );
+        return reg;
+#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
+        /* x86_64 has no alignment restrictions, and is little-endian,
+         * so 64-bit and 32-bit representations are identical */
+        uint64_t reg;
+        asm volatile (
+"\n             movq    (%0), %3"
+"\n             addq    (%1), %3"
+"\n             movq    %3, (%2)"
+"\n"
+"\n             movq    1*8(%0), %3"
+"\n             adcq    1*8(%1), %3"
+"\n             movq    %3, 1*8(%2)"
+"\n"
+"\n             movq    2*8(%0), %3"
+"\n             adcq    2*8(%1), %3"
+"\n             movq    %3, 2*8(%2)"
+"\n"
+"\n             movq    3*8(%0), %3"
+"\n             adcq    3*8(%1), %3"
+"\n             movq    %3, 3*8(%2)"
+"\n"
+"\n             sbbq    %3, %3"
+"\n"
+                : "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
+                : "0" (a), "1" (b), "2" (r)
+                : "memory"
+        );
+        return reg;
+#else
        int i;
-        for (i = 0; i < 9; i++) {
+        sp_digit carry;
-                a[i+1] += a[i] >> 26;
-                a[i] &= 0x3ffffff;
+        carry = 0;
+        for (i = 0; i < 8; i++) {
+                sp_digit w, v;
+                w = b[i] + carry;
+                v = a[i];
+                if (w != 0) {
+                        v = a[i] + w;
+                        carry = (v < a[i]);
+                        /* hope compiler detects above as "carry flag set" */
+                }
+                /* else: b + carry == 0, two cases:
+                 * b:ffffffff, carry:1
+                 * b:00000000, carry:0
+                 * in either case, r[i] = a[i] and carry remains unchanged
+                 */
+                r[i] = v;
        }
+        return carry;
+#endif
 }
-/* Add b to a into r. (r = a + b) */
+/* Sub b from a into r. (r = a - b). Return !0 on underflow */
-static void sp_256_add_10(sp_digit* r, const sp_digit* a, const sp_digit* b)
+static int sp_256_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 {
+#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
+        sp_digit reg;
+        asm volatile (
+"\n             movl    (%0), %3"
+"\n             subl    (%1), %3"
+"\n             movl    %3, (%2)"
+"\n"
+"\n             movl    1*4(%0), %3"
+"\n             sbbl    1*4(%1), %3"
+"\n             movl    %3, 1*4(%2)"
+"\n"
+"\n             movl    2*4(%0), %3"
+"\n             sbbl    2*4(%1), %3"
+"\n             movl    %3, 2*4(%2)"
+"\n"
+"\n             movl    3*4(%0), %3"
+"\n             sbbl    3*4(%1), %3"
+"\n             movl    %3, 3*4(%2)"
+"\n"
+"\n             movl    4*4(%0), %3"
+"\n             sbbl    4*4(%1), %3"
+"\n             movl    %3, 4*4(%2)"
+"\n"
+"\n             movl    5*4(%0), %3"
+"\n             sbbl    5*4(%1), %3"
+"\n             movl    %3, 5*4(%2)"
+"\n"
+"\n             movl    6*4(%0), %3"
+"\n             sbbl    6*4(%1), %3"
+"\n             movl    %3, 6*4(%2)"
+"\n"
+"\n             movl    7*4(%0), %3"
+"\n             sbbl    7*4(%1), %3"
+"\n             movl    %3, 7*4(%2)"
+"\n"
+"\n             sbbl    %3, %3"
+"\n"
+                : "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
+                : "0" (a), "1" (b), "2" (r)
+                : "memory"
+        );
+        return reg;
+#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
+        /* x86_64 has no alignment restrictions, and is little-endian,
+         * so 64-bit and 32-bit representations are identical */
+        uint64_t reg;
+        asm volatile (
+"\n             movq    (%0), %3"
+"\n             subq    (%1), %3"
+"\n             movq    %3, (%2)"
+"\n"
+"\n             movq    1*8(%0), %3"
+"\n             sbbq    1*8(%1), %3"
+"\n             movq    %3, 1*8(%2)"
+"\n"
+"\n             movq    2*8(%0), %3"
+"\n             sbbq    2*8(%1), %3"
+"\n             movq    %3, 2*8(%2)"
+"\n"
+"\n             movq    3*8(%0), %3"
+"\n             sbbq    3*8(%1), %3"
+"\n             movq    %3, 3*8(%2)"
+"\n"
+"\n             sbbq    %3, %3"
+"\n"
+                : "=r" (a), "=r" (b), "=r" (r), "=r" (reg)
+                : "0" (a), "1" (b), "2" (r)
+                : "memory"
+        );
+        return reg;
+#else
        int i;
-        for (i = 0; i < 10; i++)
+        sp_digit borrow;
-                r[i] = a[i] + b[i];
+        borrow = 0;
+        for (i = 0; i < 8; i++) {
+                sp_digit w, v;
+                w = b[i] + borrow;
+                v = a[i];
+                if (w != 0) {
+                        v = a[i] - w;
+                        borrow = (v > a[i]);
+                        /* hope compiler detects above as "carry flag set" */
+                }
+                /* else: b + borrow == 0, two cases:
+                 * b:ffffffff, borrow:1
+                 * b:00000000, borrow:0
+                 * in either case, r[i] = a[i] and borrow remains unchanged
+                 */
+                r[i] = v;
+        }
+        return borrow;
+#endif
 }
-/* Sub b from a into r. (r = a - b) */
+/* Sub p256_mod from r. (r = r - p256_mod). */
-static void sp_256_sub_10(sp_digit* r, const sp_digit* a, const sp_digit* b)
+#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
+static void sp_256_sub_8_p256_mod(sp_digit* r)
 {
-        int i;
+//p256_mod[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
-        for (i = 0; i < 10; i++)
+        asm volatile (
-                r[i] = a[i] - b[i];
+"\n             subl    $0xffffffff, (%0)"
+"\n             sbbl    $0xffffffff, 1*4(%0)"
+"\n             sbbl    $0xffffffff, 2*4(%0)"
+"\n             sbbl    $0, 3*4(%0)"
+"\n             sbbl    $0, 4*4(%0)"
+"\n             sbbl    $0, 5*4(%0)"
+"\n             sbbl    $1, 6*4(%0)"
+"\n             sbbl    $0xffffffff, 7*4(%0)"
+"\n"
+                : "=r" (r)
+                : "0" (r)
+                : "memory"
+        );
 }
+#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
-/* Shift number left one bit. Bottom bit is lost. */
+static void sp_256_sub_8_p256_mod(sp_digit* r)
-static void sp_256_rshift1_10(sp_digit* r, sp_digit* a)
 {
-        int i;
+        uint64_t reg;
-        for (i = 0; i < 9; i++)
+        uint64_t ooff;
-                r[i] = ((a[i] >> 1) | (a[i + 1] << 25)) & 0x3ffffff;
+//p256_mod[3..0] = ffffffff00000001 0000000000000000 00000000ffffffff ffffffffffffffff
-        r[9] = a[9] >> 1;
+        asm volatile (
+"\n             addq    $1, (%0)"       // adding 1 is the same as subtracting ffffffffffffffff
+"\n             cmc"                    // only carry bit needs inverting
+"\n"
+"\n             sbbq    %1, 1*8(%0)"    // %1 holds 00000000ffffffff
+"\n"
+"\n             sbbq    $0, 2*8(%0)"
+"\n"
+"\n             movq    3*8(%0), %2"
+"\n             sbbq    $0, %2"         // adding 00000000ffffffff (in %1)
+"\n             addq    %1, %2"         // is the same as subtracting ffffffff00000001
+"\n             movq    %2, 3*8(%0)"
+"\n"
+                : "=r" (r), "=r" (ooff), "=r" (reg)
+                : "0" (r), "1" (0x00000000ffffffff)
+                : "memory"
+        );
 }
+#else
-/* Mul a by scalar b and add into r. (r += a * b) */
+static void sp_256_sub_8_p256_mod(sp_digit* r)
-static void sp_256_mul_add_10(sp_digit* r, const sp_digit* a, sp_digit b)
 {
-        int64_t tb = b;
+        sp_256_sub_8(r, r, p256_mod);
-        int64_t t = 0;
-        int i;
-        for (i = 0; i < 10; i++) {
-                t += (tb * a[i]) + r[i];
-                r[i] = t & 0x3ffffff;
-                t >>= 26;
-        }
-        r[10] += t;
 }
+#endif
 /* Multiply a and b into r. (r = a * b) */
-static void sp_256_mul_10(sp_digit* r, const sp_digit* a, const sp_digit* b)
+static void sp_256_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b)
 {
+#if ALLOW_ASM && defined(__GNUC__) && defined(__i386__)
+        sp_digit rr[15]; /* in case r coincides with a or b */
+        int k;
+        uint32_t accl;
+        uint32_t acch;
+        acch = accl = 0;
+        for (k = 0; k < 15; k++) {
+                int i, j;
+                uint32_t acc_hi;
+                i = k - 7;
+                if (i < 0)
+                        i = 0;
+                j = k - i;
+                acc_hi = 0;
+                do {
+////////////////////////
+//                      uint64_t m = ((uint64_t)a[i]) * b[j];
+//                      acc_hi:acch:accl += m;
+                        asm volatile (
+                        // a[i] is already loaded in %%eax
+"\n                     mull    %7"
+"\n                     addl    %%eax, %0"
+"\n                     adcl    %%edx, %1"
+"\n                     adcl    $0, %2"
+                        : "=rm" (accl), "=rm" (acch), "=rm" (acc_hi)
+                        : "0" (accl), "1" (acch), "2" (acc_hi), "a" (a[i]), "m" (b[j])
+                        : "cc", "dx"
+                        );
+////////////////////////
+                        j--;
+                        i++;
+                } while (i != 8 && i <= k);
+                rr[k] = accl;
+                accl = acch;
+                acch = acc_hi;
+        }
+        r[15] = accl;
+        memcpy(r, rr, sizeof(rr));
+#elif ALLOW_ASM && defined(__GNUC__) && defined(__x86_64__)
+        /* x86_64 has no alignment restrictions, and is little-endian,
+         * so 64-bit and 32-bit representations are identical */
+        const uint64_t* aa = (const void*)a;
+        const uint64_t* bb = (const void*)b;
+        uint64_t rr[8];
+        int k;
+        uint64_t accl;
+        uint64_t acch;
+        acch = accl = 0;
+        for (k = 0; k < 7; k++) {
+                int i, j;
+                uint64_t acc_hi;
+                i = k - 3;
+                if (i < 0)
+                        i = 0;
+                j = k - i;
+                acc_hi = 0;
+                do {
+////////////////////////
+//                      uint128_t m = ((uint128_t)a[i]) * b[j];
+//                      acc_hi:acch:accl += m;
+                        asm volatile (
+                        // aa[i] is already loaded in %%rax
+"\n                     mulq    %7"
+"\n                     addq    %%rax, %0"
+"\n                     adcq    %%rdx, %1"
+"\n                     adcq    $0, %2"
+                        : "=rm" (accl), "=rm" (acch), "=rm" (acc_hi)
+                        : "0" (accl), "1" (acch), "2" (acc_hi), "a" (aa[i]), "m" (bb[j])
+                        : "cc", "dx"
+                        );
+////////////////////////
+                        j--;
+                        i++;
+                } while (i != 4 && i <= k);
+                rr[k] = accl;
+                accl = acch;
+                acch = acc_hi;
+        }
+        rr[7] = accl;
+        memcpy(r, rr, sizeof(rr));
+#elif 0
+        //TODO: arm assembly (untested)
+        sp_digit tmp[16];
+        asm volatile (
+"\n             mov     r5, #0"
+"\n             mov     r6, #0"
+"\n             mov     r7, #0"
+"\n             mov     r8, #0"
+"\n     1:"
+"\n             subs    r3, r5, #28"
+"\n             movcc   r3, #0"
+"\n             sub     r4, r5, r3"
+"\n     2:"
+"\n             ldr     r14, [%[a], r3]"
+"\n             ldr     r12, [%[b], r4]"
+"\n             umull   r9, r10, r14, r12"
+"\n             adds    r6, r6, r9"
+"\n             adcs    r7, r7, r10"
+"\n             adc     r8, r8, #0"
+"\n             add     r3, r3, #4"
+"\n             sub     r4, r4, #4"
+"\n             cmp     r3, #32"
+"\n             beq     3f"
+"\n             cmp     r3, r5"
+"\n             ble     2b"
+"\n     3:"
+"\n             str     r6, [%[r], r5]"
+"\n             mov     r6, r7"
+"\n             mov     r7, r8"
+"\n             mov     r8, #0"
+"\n             add     r5, r5, #4"
+"\n             cmp     r5, #56"
+"\n             ble     1b"
+"\n             str     r6, [%[r], r5]"
+                : [r] "r" (tmp), [a] "r" (a), [b] "r" (b)
+                : "memory", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "r14"
+        );
+        memcpy(r, tmp, sizeof(tmp));
+#else
+        sp_digit rr[15]; /* in case r coincides with a or b */
        int i, j, k;
-        int64_t c;
+        uint64_t acc;
-        c = ((int64_t)a[9]) * b[9];
+        acc = 0;
-        r[19] = (sp_digit)(c >> 26);
+        for (k = 0; k < 15; k++) {
-        c = (c & 0x3ffffff) << 26;
+                uint32_t acc_hi;
-        for (k = 17; k >= 0; k--) {
+                i = k - 7;
-                for (i = 9; i >= 0; i--) {
+                if (i < 0)
-                        j = k - i;
+                        i = 0;
-                        if (j >= 10)
+                j = k - i;
-                                break;
+                acc_hi = 0;
-                        if (j < 0)
+                do {
-                                continue;
+                        uint64_t m = ((uint64_t)a[i]) * b[j];
-                        c += ((int64_t)a[i]) * b[j];
+                        acc += m;
-                }
+                        if (acc < m)
-                r[k + 2] += c >> 52;
+                                acc_hi++;
-                r[k + 1] = (c >> 26) & 0x3ffffff;
+                        j--;
-                c = (c & 0x3ffffff) << 26;
+                        i++;
+                } while (i != 8 && i <= k);
+                rr[k] = acc;
+                acc = (acc >> 32) | ((uint64_t)acc_hi << 32);
        }
-        r[0] = (sp_digit)(c >> 26);
+        r[15] = acc;
+        memcpy(r, rr, sizeof(rr));
+#endif
 }
-/* Square a and put result in r. (r = a * a) */
+/* Shift number right one bit. Bottom bit is lost. */
-static void sp_256_sqr_10(sp_digit* r, const sp_digit* a)
+static void sp_256_rshift1_8(sp_digit* r, sp_digit* a, sp_digit carry)
 {
-        int i, j, k;
+        int i;
-        int64_t c;
+        carry = (!!carry << 31);
-        c = ((int64_t)a[9]) * a[9];
+        for (i = 7; i >= 0; i--) {
-        r[19] = (sp_digit)(c >> 26);
+                sp_digit c = a[i] << 31;
-        c = (c & 0x3ffffff) << 26;
+                r[i] = (a[i] >> 1) | carry;
-        for (k = 17; k >= 0; k--) {
+                carry = c;
-                for (i = 9; i >= 0; i--) {
-                        j = k - i;
-                        if (j >= 10 || i <= j)
-                                break;
-                        if (j < 0)
-                                continue;
-                        c += ((int64_t)a[i]) * a[j] * 2;
-                }
-                if (i == j)
-                        c += ((int64_t)a[i]) * a[i];
-                r[k + 2] += c >> 52;
-                r[k + 1] = (c >> 26) & 0x3ffffff;
-                c = (c & 0x3ffffff) << 26;
        }
-        r[0] = (sp_digit)(c >> 26);
 }
 /* Divide the number by 2 mod the modulus (prime). (r = a / 2 % m) */
-static void sp_256_div2_10(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_256_div2_8(sp_digit* r, const sp_digit* a, const sp_digit* m)
 {
+        int carry = 0;
        if (a[0] & 1)
-                sp_256_add_10(r, a, m);
+                carry = sp_256_add_8(r, a, m);
-        sp_256_norm_10(r);
+        sp_256_norm_8(r);
-        sp_256_rshift1_10(r, r);
+        sp_256_rshift1_8(r, r, carry);
 }
 /* Add two Montgomery form numbers (r = a + b % m) */
-static void sp_256_mont_add_10(sp_digit* r, const sp_digit* a, const sp_digit* b,
+static void sp_256_mont_add_8(sp_digit* r, const sp_digit* a, const sp_digit* b
-                const sp_digit* m)
+                /*, const sp_digit* m*/)
 {
-        sp_256_add_10(r, a, b);
+//      const sp_digit* m = p256_mod;
-        sp_256_norm_10(r);
-        if ((r[9] >> 22) > 0)
+        int carry = sp_256_add_8(r, a, b);
-                sp_256_sub_10(r, r, m);
+        sp_256_norm_8(r);
-        sp_256_norm_10(r);
+        if (carry) {
+                sp_256_sub_8_p256_mod(r);
+                sp_256_norm_8(r);
+        }
 }
 /* Subtract two Montgomery form numbers (r = a - b % m) */
-static void sp_256_mont_sub_10(sp_digit* r, const sp_digit* a, const sp_digit* b,
+static void sp_256_mont_sub_8(sp_digit* r, const sp_digit* a, const sp_digit* b
-                const sp_digit* m)
+                /*, const sp_digit* m*/)
 {
-        sp_256_sub_10(r, a, b);
+        const sp_digit* m = p256_mod;
-        if (r[9] >> 22)
-                sp_256_add_10(r, r, m);
+        int borrow;
-        sp_256_norm_10(r);
+        borrow = sp_256_sub_8(r, a, b);
+        sp_256_norm_8(r);
+        if (borrow) {
+                sp_256_add_8(r, r, m);
+                sp_256_norm_8(r);
+        }
 }
 /* Double a Montgomery form number (r = a + a % m) */
-static void sp_256_mont_dbl_10(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_256_mont_dbl_8(sp_digit* r, const sp_digit* a /*, const sp_digit* m*/)
 {
-        sp_256_add_10(r, a, a);
+//      const sp_digit* m = p256_mod;
-        sp_256_norm_10(r);
-        if ((r[9] >> 22) > 0)
+        int carry = sp_256_add_8(r, a, a);
-                sp_256_sub_10(r, r, m);
+        sp_256_norm_8(r);
-        sp_256_norm_10(r);
+        if (carry)
+                sp_256_sub_8_p256_mod(r);
+        sp_256_norm_8(r);
 }
 /* Triple a Montgomery form number (r = a + a + a % m) */
-static void sp_256_mont_tpl_10(sp_digit* r, const sp_digit* a, const sp_digit* m)
+static void sp_256_mont_tpl_8(sp_digit* r, const sp_digit* a /*, const sp_digit* m*/)
 {
-        sp_256_add_10(r, a, a);
+//      const sp_digit* m = p256_mod;
-        sp_256_norm_10(r);
-        if ((r[9] >> 22) > 0)
+        int carry = sp_256_add_8(r, a, a);
-                sp_256_sub_10(r, r, m);
+        sp_256_norm_8(r);
-        sp_256_norm_10(r);
+        if (carry) {
-        sp_256_add_10(r, r, a);
+                sp_256_sub_8_p256_mod(r);
-        sp_256_norm_10(r);
+                sp_256_norm_8(r);
-        if ((r[9] >> 22) > 0)
+        }
-                sp_256_sub_10(r, r, m);
+        carry = sp_256_add_8(r, r, a);
-        sp_256_norm_10(r);
+        sp_256_norm_8(r);
+        if (carry) {
+                sp_256_sub_8_p256_mod(r);
+                sp_256_norm_8(r);
+        }
 }
 /* Shift the result in the high 256 bits down to the bottom. */
-static void sp_256_mont_shift_10(sp_digit* r, const sp_digit* a)
+static void sp_256_mont_shift_8(sp_digit* r, const sp_digit* a)
 {
        int i;
-        sp_digit n, s;
+        for (i = 0; i < 8; i++) {
-        s = a[10];
+                r[i] = a[i+8];
-        n = a[9] >> 22;
+                r[i+8] = 0;
-        for (i = 0; i < 9; i++) {
-                n += (s & 0x3ffffff) << 4;
-                r[i] = n & 0x3ffffff;
-                n >>= 26;
-                s = a[11 + i] + (s >> 26);
        }
-        n += s << 4;
+}
-        r[9] = n;
-        memset(&r[10], 0, sizeof(*r) * 10);
+/* Mul a by scalar b and add into r. (r += a * b) */
+static int sp_256_mul_add_8(sp_digit* r /*, const sp_digit* a, sp_digit b*/)
+{
+//      const sp_digit* a = p256_mod;
+//a[7..0] = ffffffff 00000001 00000000 00000000 00000000 ffffffff ffffffff ffffffff
+        sp_digit b = r[0];
+        uint64_t t;
+//      t = 0;
+//      for (i = 0; i < 8; i++) {
+//              uint32_t t_hi;
+//              uint64_t m = ((uint64_t)b * a[i]) + r[i];
+//              t += m;
+//              t_hi = (t < m);
+//              r[i] = (sp_digit)t;
+//              t = (t >> 32) | ((uint64_t)t_hi << 32);
+//      }
+//      r[8] += (sp_digit)t;
+        // Unroll, then optimize the above loop:
+                //uint32_t t_hi;
+                uint64_t m;
+                uint32_t t32;
+                //m = ((uint64_t)b * a[0]) + r[0];
+                //  Since b is r[0] and a[0] is ffffffff, the above optimizes to:
+                //  m = r[0] * ffffffff + r[0] = (r[0] * 100000000 - r[0]) + r[0] = r[0] << 32;
+                //t += m;
+                //  t = r[0] << 32 = b << 32;
+                //t_hi = (t < m);
+                //  t_hi = 0;
+                //r[0] = (sp_digit)t;
+                r[0] = 0;
+                //t = (t >> 32) | ((uint64_t)t_hi << 32);
+                //  t = b;
+                //m = ((uint64_t)b * a[1]) + r[1];
+                //  Since a[1] is ffffffff, the above optimizes to:
+                //  m = b * ffffffff + r[1] = (b * 100000000 - b) + r[1] = (b << 32) - b + r[1];
+                //t += m;
+                //  t = b + (b << 32) - b + r[1] = (b << 32) + r[1];
+                //t_hi = (t < m);
+                //  t_hi = 0;
+                //r[1] = (sp_digit)t;
+                //  r[1] = r[1];
+                //t = (t >> 32) | ((uint64_t)t_hi << 32);
+                //  t = b;
+                //m = ((uint64_t)b * a[2]) + r[2];
+                //  Since a[2] is ffffffff, the above optimizes to:
+                //  m = b * ffffffff + r[2] = (b * 100000000 - b) + r[2] = (b << 32) - b + r[2];
+                //t += m;
+                //  t = b + (b << 32) - b + r[2] = (b << 32) + r[2]
+                //t_hi = (t < m);
+                //  t_hi = 0;
+                //r[2] = (sp_digit)t;
+                //  r[2] = r[2];
+                //t = (t >> 32) | ((uint64_t)t_hi << 32);
+                //  t = b;
+                //m = ((uint64_t)b * a[3]) + r[3];
+                //  Since a[3] is 00000000, the above optimizes to:
+                //  m = b * 0 + r[3] = r[3];
+                //t += m;
+                //  t = b + r[3];
+                //t_hi = (t < m);
+                //  t_hi = 0;
+                //r[3] = (sp_digit)t;
+                r[3] = r[3] + b;
+                //t = (t >> 32) | ((uint64_t)t_hi << 32);
+                t32 = (r[3] < b); // 0 or 1
+                //m = ((uint64_t)b * a[4]) + r[4];
+                //  Since a[4] is 00000000, the above optimizes to:
+                //  m = b * 0 + r[4] = r[4];
+                //t += m;
+                //  t = t32 + r[4];
+                //t_hi = (t < m);
+                //  t_hi = 0;
+                //r[4] = (sp_digit)t;
+                //t = (t >> 32) | ((uint64_t)t_hi << 32);
+                if (t32 != 0) {
+                        r[4]++;
+                        t32 = (r[4] == 0); // 0 or 1
+                //m = ((uint64_t)b * a[5]) + r[5];
+                //  Since a[5] is 00000000, the above optimizes to:
+                //  m = b * 0 + r[5] = r[5];
+                //t += m;
+                //  t = t32 + r[5]; (t32 is 0 or 1)
+                //t_hi = (t < m);
+                //  t_hi = 0;
+                //r[5] = (sp_digit)t;
+                //t = (t >> 32) | ((uint64_t)t_hi << 32);
+                        if (t32 != 0) {
+                                r[5]++;
+                                t32 = (r[5] == 0); // 0 or 1
+                        }
+                }
+                //m = ((uint64_t)b * a[6]) + r[6];
+                //  Since a[6] is 00000001, the above optimizes to:
+                //  m = (uint64_t)b + r[6]; // 33 bits at most
+                //t += m;
+                t = t32 + (uint64_t)b + r[6];
+                //t_hi = (t < m);
+                //  t_hi = 0;
+                r[6] = (sp_digit)t;
+                //t = (t >> 32) | ((uint64_t)t_hi << 32);
+                t = (t >> 32);
+                //m = ((uint64_t)b * a[7]) + r[7];
+                //  Since a[7] is ffffffff, the above optimizes to:
+                //  m = b * ffffffff + r[7] = (b * 100000000 - b) + r[7]
+                m = ((uint64_t)b << 32) - b + r[7];
+                t += m;
+                //t_hi = (t < m);
+                //  t_hi in fact is always 0 here (256bit * 32bit can't have more than 32 bits of overflow)
+                r[7] = (sp_digit)t;
+                //t = (t >> 32) | ((uint64_t)t_hi << 32);
+                t = (t >> 32);
+        r[8] += (sp_digit)t;
+        return (r[8] < (sp_digit)t); /* 1 if addition overflowed */
 }
 /* Reduce the number back to 256 bits using Montgomery reduction.
@@ -355,39 +777,159 @@ static void sp_256_mont_shift_10(sp_digit* r, const sp_digit* a)
 * m   The single precision number representing the modulus.
 * mp  The digit representing the negative inverse of m mod 2^n.
 */
-static void sp_256_mont_reduce_10(sp_digit* a, const sp_digit* m, sp_digit mp)
+static void sp_256_mont_reduce_8(sp_digit* a/*, const sp_digit* m, sp_digit mp*/)
 {
+//      const sp_digit* m = p256_mod;
+        sp_digit mp = p256_mp_mod;
        int i;
-        sp_digit mu;
+//      sp_digit mu;
        if (mp != 1) {
-                for (i = 0; i < 9; i++) {
+                sp_digit word16th = 0;
-                        mu = (a[i] * mp) & 0x3ffffff;
+                for (i = 0; i < 8; i++) {
-                        sp_256_mul_add_10(a+i, m, mu);
+//                      mu = (sp_digit)(a[i] * mp);
-                        a[i+1] += a[i] >> 26;
+                        if (sp_256_mul_add_8(a+i /*, m, mu*/)) {
+                                int j = i + 8;
+ inc_next_word0:
+                                if (++j > 15) { /* a[16] array has no more words? */
+                                        word16th++;
+                                        continue;
+                                }
+                                if (++a[j] == 0) /* did this overflow too? */
+                                        goto inc_next_word0;
+                        }
                }
-                mu = (a[i] * mp) & 0x3fffffl;
+                sp_256_mont_shift_8(a, a);
-                sp_256_mul_add_10(a+i, m, mu);
+                if (word16th != 0)
-                a[i+1] += a[i] >> 26;
+                        sp_256_sub_8_p256_mod(a);
-                a[i] &= 0x3ffffff;
+                sp_256_norm_8(a);
        }
-        else {
+        else { /* Same code for explicit mp == 1 (which is always the case for P256) */
-                for (i = 0; i < 9; i++) {
+                sp_digit word16th = 0;
-                        mu = a[i] & 0x3ffffff;
+                for (i = 0; i < 8; i++) {
-                        sp_256_mul_add_10(a+i, p256_mod, mu);
+                        /*mu = a[i];*/
-                        a[i+1] += a[i] >> 26;
+                        if (sp_256_mul_add_8(a+i /*, m, mu*/)) {
+                                int j = i + 8;
+ inc_next_word:
+                                if (++j > 15) { /* a[16] array has no more words? */
+                                        word16th++;
+                                        continue;
+                                }
+                                if (++a[j] == 0) /* did this overflow too? */
+                                        goto inc_next_word;
+                        }
                }
-                mu = a[i] & 0x3fffffl;
+                sp_256_mont_shift_8(a, a);
-                sp_256_mul_add_10(a+i, p256_mod, mu);
+                if (word16th != 0)
-                a[i+1] += a[i] >> 26;
+                        sp_256_sub_8_p256_mod(a);
-                a[i] &= 0x3ffffff;
+                sp_256_norm_8(a);
        }
-        sp_256_mont_shift_10(a, a);
-        if ((a[9] >> 22) > 0)
-                sp_256_sub_10(a, a, m);
-        sp_256_norm_10(a);
 }
+#if 0
+//TODO: arm32 asm (also adapt for x86?)
+static void sp_256_mont_reduce_8(sp_digit* a, sp_digit* m, sp_digit mp)
+{
+        sp_digit ca = 0;
+        asm volatile (
+        # i = 0
+        mov     r12, #0
+        ldr     r10, [%[a], #0]
+        ldr     r14, [%[a], #4]
+1:
+        # mu = a[i] * mp
+        mul     r8, %[mp], r10
+        # a[i+0] += m[0] * mu
+        ldr     r7, [%[m], #0]
+        ldr     r9, [%[a], #0]
+        umull   r6, r7, r8, r7
+        adds    r10, r10, r6
+        adc     r5, r7, #0
+        # a[i+1] += m[1] * mu
+        ldr     r7, [%[m], #4]
+        ldr     r9, [%[a], #4]
+        umull   r6, r7, r8, r7
+        adds    r10, r14, r6
+        adc     r4, r7, #0
+        adds    r10, r10, r5
+        adc     r4, r4, #0
+        # a[i+2] += m[2] * mu
+        ldr     r7, [%[m], #8]
+        ldr     r14, [%[a], #8]
+        umull   r6, r7, r8, r7
+        adds    r14, r14, r6
+        adc     r5, r7, #0
+        adds    r14, r14, r4
+        adc     r5, r5, #0
+        # a[i+3] += m[3] * mu
+        ldr     r7, [%[m], #12]
+        ldr     r9, [%[a], #12]
+        umull   r6, r7, r8, r7
+        adds    r9, r9, r6
+        adc     r4, r7, #0
+        adds    r9, r9, r5
+        str     r9, [%[a], #12]
+        adc     r4, r4, #0
+        # a[i+4] += m[4] * mu
+        ldr     r7, [%[m], #16]
+        ldr     r9, [%[a], #16]
+        umull   r6, r7, r8, r7
+        adds    r9, r9, r6
+        adc     r5, r7, #0
+        adds    r9, r9, r4
+        str     r9, [%[a], #16]
+        adc     r5, r5, #0
+        # a[i+5] += m[5] * mu
+        ldr     r7, [%[m], #20]
+        ldr     r9, [%[a], #20]
+        umull   r6, r7, r8, r7
+        adds    r9, r9, r6
+        adc     r4, r7, #0
+        adds    r9, r9, r5
+        str     r9, [%[a], #20]
+        adc     r4, r4, #0
+        # a[i+6] += m[6] * mu
+        ldr     r7, [%[m], #24]
+        ldr     r9, [%[a], #24]
+        umull   r6, r7, r8, r7
+        adds    r9, r9, r6
+        adc     r5, r7, #0
+        adds    r9, r9, r4
+        str     r9, [%[a], #24]
+        adc     r5, r5, #0
+        # a[i+7] += m[7] * mu
+        ldr     r7, [%[m], #28]
+        ldr     r9, [%[a], #28]
+        umull   r6, r7, r8, r7
+        adds    r5, r5, r6
+        adcs    r7, r7, %[ca]
+        mov     %[ca], #0
+        adc     %[ca], %[ca], %[ca]
+        adds    r9, r9, r5
+        str     r9, [%[a], #28]
+        ldr     r9, [%[a], #32]
+        adcs    r9, r9, r7
+        str     r9, [%[a], #32]
+        adc     %[ca], %[ca], #0
+        # i += 1
+        add     %[a], %[a], #4
+        add     r12, r12, #4
+        cmp     r12, #32
+        blt     1b
+        str     r10, [%[a], #0]
+        str     r14, [%[a], #4]
+        : [ca] "+r" (ca), [a] "+r" (a)
+        : [m] "r" (m), [mp] "r" (mp)
+        : "memory", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r12", "r14"
+        );
+        memcpy(a, a + 8, 32);
+        if (ca)
+                a -= m;
+}
+#endif
 /* Multiply two Montgomery form numbers mod the modulus (prime).
 * (r = a * b mod m)
@@ -398,11 +940,13 @@ static void sp_256_mont_reduce_10(sp_digit* a, const sp_digit* m, sp_digit mp)
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
-static void sp_256_mont_mul_10(sp_digit* r, const sp_digit* a, const sp_digit* b,
+static void sp_256_mont_mul_8(sp_digit* r, const sp_digit* a, const sp_digit* b
-                const sp_digit* m, sp_digit mp)
+                /*, const sp_digit* m, sp_digit mp*/)
 {
-        sp_256_mul_10(r, a, b);
+        //const sp_digit* m = p256_mod;
-        sp_256_mont_reduce_10(r, m, mp);
+        //sp_digit mp = p256_mp_mod;
+        sp_256_mul_8(r, a, b);
+        sp_256_mont_reduce_8(r /*, m, mp*/);
 }
 /* Square the Montgomery form number. (r = a * a mod m)
@@ -412,11 +956,12 @@ static void sp_256_mont_mul_10(sp_digit* r, const sp_digit* a, const sp_digit* b
 * m   Modulus (prime).
 * mp  Montgomery multiplier.
 */
-static void sp_256_mont_sqr_10(sp_digit* r, const sp_digit* a, const sp_digit* m,
+static void sp_256_mont_sqr_8(sp_digit* r, const sp_digit* a
-                sp_digit mp)
+                /*, const sp_digit* m, sp_digit mp*/)
 {
-        sp_256_sqr_10(r, a);
+        //const sp_digit* m = p256_mod;
-        sp_256_mont_reduce_10(r, m, mp);
+        //sp_digit mp = p256_mp_mod;
+        sp_256_mont_mul_8(r, a, a /*, m, mp*/);
 }
 /* Invert the number, in Montgomery form, modulo the modulus (prime) of the
@@ -437,19 +982,19 @@ static const uint32_t p256_mod_2[8] = {
 //543210987654321098765432109876543210987654321098765432109876543210...09876543210...09876543210
 //111111111111111111111111111111110000000000000000000000000000000100...00000111111...11111111101
 #endif
-static void sp_256_mont_inv_10(sp_digit* r, sp_digit* a)
+static void sp_256_mont_inv_8(sp_digit* r, sp_digit* a)
 {
-        sp_digit t[2*10]; //can be just [10]?
+        sp_digit t[2*8]; //can be just [8]?
        int i;
-        memcpy(t, a, sizeof(sp_digit) * 10);
+        memcpy(t, a, sizeof(sp_digit) * 8);
        for (i = 254; i >= 0; i--) {
-                sp_256_mont_sqr_10(t, t, p256_mod, p256_mp_mod);
+                sp_256_mont_sqr_8(t, t /*, p256_mod, p256_mp_mod*/);
                /*if (p256_mod_2[i / 32] & ((sp_digit)1 << (i % 32)))*/
                if (i >= 224 || i == 192 || (i <= 95 && i != 1))
-                        sp_256_mont_mul_10(t, t, a, p256_mod, p256_mp_mod);
+                        sp_256_mont_mul_8(t, t, a /*, p256_mod, p256_mp_mod*/);
        }
-        memcpy(r, t, sizeof(sp_digit) * 10);
+        memcpy(r, t, sizeof(sp_digit) * 8);
 }
 /* Multiply a number by Montgomery normalizer mod modulus (prime).
@@ -457,93 +1002,29 @@ static void sp_256_mont_inv_10(sp_digit* r, sp_digit* a)
 * r  The resulting Montgomery form number.
 * a  The number to convert.
 */
-static void sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a)
+static void sp_256_mod_mul_norm_8(sp_digit* r, const sp_digit* a)
 {
        int64_t t[8];
-        int64_t o;
+        int32_t o;
-        uint32_t a32;
+#define A(n) ((uint64_t)a[n])
        /*  1  1  0 -1 -1 -1 -1  0 */
+        t[0] = 0 + A(0) + A(1) - A(3) - A(4) - A(5) - A(6);
        /*  0  1  1  0 -1 -1 -1 -1 */
+        t[1] = 0 + A(1) + A(2) - A(4) - A(5) - A(6) - A(7);
        /*  0  0  1  1  0 -1 -1 -1 */
+        t[2] = 0 + A(2) + A(3) - A(5) - A(6) - A(7);
        /* -1 -1  0  2  2  1  0 -1 */
+        t[3] = 0 - A(0) - A(1) + 2 * A(3) + 2 * A(4) + A(5) - A(7);
        /*  0 -1 -1  0  2  2  1  0 */
+        t[4] = 0 - A(1) - A(2) + 2 * A(4) + 2 * A(5) + A(6);
        /*  0  0 -1 -1  0  2  2  1 */
+        t[5] = 0 - A(2) - A(3) + 2 * A(5) + 2 * A(6) + A(7);
        /* -1 -1  0  0  0  1  3  2 */
+        t[6] = 0 - A(0) - A(1) + A(5) + 3 * A(6) + 2 * A(7);
        /*  1  0 -1 -1 -1 -1  0  3 */
-        // t[] should be calculated from "a" (converted from 26-bit to 32-bit vector a32[8])
+        t[7] = 0 + A(0) - A(2) - A(3) - A(4) - A(5) + 3 * A(7);
-        // according to the above matrix:
+#undef A
-        //t[0] = 0 + a32[0] + a32[1]            - a32[3]   - a32[4]   - a32[5]   - a32[6]             ;
-        //t[1] = 0          + a32[1] + a32[2]              - a32[4]   - a32[5]   - a32[6]   - a32[7]  ;
-        //t[2] = 0                   + a32[2]   + a32[3]              - a32[5]   - a32[6]   - a32[7]  ;
-        //t[3] = 0 - a32[0] - a32[1]            + 2*a32[3] + 2*a32[4] + a32[5]              - a32[7]  ;
-        //t[4] = 0          - a32[1] - a32[2]              + 2*a32[4] + 2*a32[5] + a32[6]             ;
-        //t[5] = 0                   - a32[2]   - a32[3]              + 2*a32[5] + 2*a32[6] + a32[7]  ;
-        //t[6] = 0 - a32[0] - a32[1]                                  + a32[5]   + 3*a32[6] + 2*a32[7];
-        //t[7] = 0 + a32[0]          - a32[2]   - a32[3]   - a32[4]   - a32[5]              + 3*a32[7];
-        // We can do it "piecemeal" after each a32[i] is known, no need to store entire a32[8] vector:
-#define A32 (int64_t)a32
-        a32 = a[0] | (a[1] << 26);
-        t[0] = 0 + A32;
-        t[3] = 0 - A32;
-        t[6] = 0 - A32;
-        t[7] = 0 + A32;
-        a32 = (a[1] >> 6) | (a[2] << 20);
-        t[0] += A32    ;
-        t[1]  = 0 + A32;
-        t[3] -= A32    ;
-        t[4]  = 0 - A32;
-        t[6] -= A32    ;
-        a32 = (a[2] >> 12) | (a[3] << 14);
-        t[1] += A32    ;
-        t[2]  = 0 + A32;
-        t[4] -= A32    ;
-        t[5]  = 0 - A32;
-        t[7] -= A32    ;
-        a32 = (a[3] >> 18) | (a[4] << 8);
-        t[0] -= A32  ;
-        t[2] += A32  ;
-        t[3] += 2*A32;
-        t[5] -= A32  ;
-        t[7] -= A32  ;
-        a32 = (a[4] >> 24) | (a[5] << 2) | (a[6] << 28);
-        t[0] -= A32  ;
-        t[1] -= A32  ;
-        t[3] += 2*A32;
-        t[4] += 2*A32;
-        t[7] -= A32  ;
-        a32 = (a[6] >> 4) | (a[7] << 22);
-        t[0] -= A32  ;
-        t[1] -= A32  ;
-        t[2] -= A32  ;
-        t[3] += A32  ;
-        t[4] += 2*A32;
-        t[5] += 2*A32;
-        t[6] += A32  ;
-        t[7] -= A32  ;
-        a32 = (a[7] >> 10) | (a[8] << 16);
-        t[0] -= A32  ;
-        t[1] -= A32  ;
-        t[2] -= A32  ;
-        t[4] += A32  ;
-        t[5] += 2*A32;
-        t[6] += 3*A32;
-        a32 = (a[8] >> 16) | (a[9] << 10);
-        t[1] -= A32  ;
-        t[2] -= A32  ;
-        t[3] -= A32  ;
-        t[5] += A32  ;
-        t[6] += 2*A32;
-        t[7] += 3*A32;
-#undef A32
        t[1] += t[0] >> 32; t[0] &= 0xffffffff;
        t[2] += t[1] >> 32; t[1] &= 0xffffffff;
@@ -552,29 +1033,27 @@ static void sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a)
        t[5] += t[4] >> 32; t[4] &= 0xffffffff;
        t[6] += t[5] >> 32; t[5] &= 0xffffffff;
        t[7] += t[6] >> 32; t[6] &= 0xffffffff;
-        o     = t[7] >> 32; t[7] &= 0xffffffff;
+        o     = t[7] >> 32; //t[7] &= 0xffffffff;
        t[0] += o;
        t[3] -= o;
        t[6] -= o;
        t[7] += o;
-        t[1] += t[0] >> 32; //t[0] &= 0xffffffff;
+        r[0] = (sp_digit)t[0];
-        t[2] += t[1] >> 32; //t[1] &= 0xffffffff;
+        t[1] += t[0] >> 32;
-        t[3] += t[2] >> 32; //t[2] &= 0xffffffff;
+        r[1] = (sp_digit)t[1];
-        t[4] += t[3] >> 32; //t[3] &= 0xffffffff;
+        t[2] += t[1] >> 32;
-        t[5] += t[4] >> 32; //t[4] &= 0xffffffff;
+        r[2] = (sp_digit)t[2];
-        t[6] += t[5] >> 32; //t[5] &= 0xffffffff;
+        t[3] += t[2] >> 32;
-        t[7] += t[6] >> 32; //t[6] &= 0xffffffff; - (uint32_t)t[i] casts below accomplish masking
+        r[3] = (sp_digit)t[3];
+        t[4] += t[3] >> 32;
-        r[0] = 0x3ffffff & ((sp_digit)((uint32_t)t[0]));
+        r[4] = (sp_digit)t[4];
-        r[1] = 0x3ffffff & ((sp_digit)((uint32_t)t[0] >> 26) | ((sp_digit)t[1] <<  6));
+        t[5] += t[4] >> 32;
-        r[2] = 0x3ffffff & ((sp_digit)((uint32_t)t[1] >> 20) | ((sp_digit)t[2] << 12));
+        r[5] = (sp_digit)t[5];
-        r[3] = 0x3ffffff & ((sp_digit)((uint32_t)t[2] >> 14) | ((sp_digit)t[3] << 18));
+        t[6] += t[5] >> 32;
-        r[4] = 0x3ffffff & ((sp_digit)((uint32_t)t[3] >>  8) | ((sp_digit)t[4] << 24));
+        r[6] = (sp_digit)t[6];
-        r[5] = 0x3ffffff & ((sp_digit)((uint32_t)t[4] >>  2));
+//      t[7] += t[6] >> 32;
-        r[6] = 0x3ffffff & ((sp_digit)((uint32_t)t[4] >> 28) | ((sp_digit)t[5] <<  4));
+//      r[7] = (sp_digit)t[7];
-        r[7] = 0x3ffffff & ((sp_digit)((uint32_t)t[5] >> 22) | ((sp_digit)t[6] << 10));
+        r[7] = (sp_digit)t[7] + (sp_digit)(t[6] >> 32);
-        r[8] = 0x3ffffff & ((sp_digit)((uint32_t)t[6] >> 16) | ((sp_digit)t[7] << 16));
-        r[9] =             ((sp_digit)((uint32_t)t[7] >> 10));
 }
 /* Map the Montgomery form projective co-ordinate point to an affine point.
@@ -582,33 +1061,33 @@ static void sp_256_mod_mul_norm_10(sp_digit* r, const sp_digit* a)
 * r  Resulting affine co-ordinate point.
 * p  Montgomery form projective co-ordinate point.
 */
-static void sp_256_map_10(sp_point* r, sp_point* p)
+static void sp_256_map_8(sp_point* r, sp_point* p)
 {
-        sp_digit t1[2*10];
+        sp_digit t1[2*8];
-        sp_digit t2[2*10];
+        sp_digit t2[2*8];
-        sp_256_mont_inv_10(t1, p->z);
+        sp_256_mont_inv_8(t1, p->z);
-        sp_256_mont_sqr_10(t2, t1, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_8(t2, t1 /*, p256_mod, p256_mp_mod*/);
-        sp_256_mont_mul_10(t1, t2, t1, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(t1, t2, t1 /*, p256_mod, p256_mp_mod*/);
        /* x /= z^2 */
-        sp_256_mont_mul_10(r->x, p->x, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(r->x, p->x, t2 /*, p256_mod, p256_mp_mod*/);
-        memset(r->x + 10, 0, sizeof(r->x) / 2);
+        memset(r->x + 8, 0, sizeof(r->x) / 2);
-        sp_256_mont_reduce_10(r->x, p256_mod, p256_mp_mod);
+        sp_256_mont_reduce_8(r->x /*, p256_mod, p256_mp_mod*/);
        /* Reduce x to less than modulus */
-        if (sp_256_cmp_10(r->x, p256_mod) >= 0)
+        if (sp_256_cmp_8(r->x, p256_mod) >= 0)
-                sp_256_sub_10(r->x, r->x, p256_mod);
+                sp_256_sub_8_p256_mod(r->x);
-        sp_256_norm_10(r->x);
+        sp_256_norm_8(r->x);
        /* y /= z^3 */
-        sp_256_mont_mul_10(r->y, p->y, t1, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(r->y, p->y, t1 /*, p256_mod, p256_mp_mod*/);
-        memset(r->y + 10, 0, sizeof(r->y) / 2);
+        memset(r->y + 8, 0, sizeof(r->y) / 2);
-        sp_256_mont_reduce_10(r->y, p256_mod, p256_mp_mod);
+        sp_256_mont_reduce_8(r->y /*, p256_mod, p256_mp_mod*/);
        /* Reduce y to less than modulus */
-        if (sp_256_cmp_10(r->y, p256_mod) >= 0)
+        if (sp_256_cmp_8(r->y, p256_mod) >= 0)
-                sp_256_sub_10(r->y, r->y, p256_mod);
+                sp_256_sub_8_p256_mod(r->y);
-        sp_256_norm_10(r->y);
+        sp_256_norm_8(r->y);
        memset(r->z, 0, sizeof(r->z));
        r->z[0] = 1;
@@ -619,56 +1098,62 @@ static void sp_256_map_10(sp_point* r, sp_point* p)
 * r  Result of doubling point.
 * p  Point to double.
 */
-static void sp_256_proj_point_dbl_10(sp_point* r, sp_point* p)
+static void sp_256_proj_point_dbl_8(sp_point* r, sp_point* p)
 {
-        sp_point tp;
+        sp_digit t1[2*8];
-        sp_digit t1[2*10];
+        sp_digit t2[2*8];
-        sp_digit t2[2*10];
        /* Put point to double into result */
        if (r != p)
                *r = *p; /* struct copy */
-        if (r->infinity) {
+        if (r->infinity)
-                /* If infinity, don't double (work on dummy value) */
+                return;
-                r = &tp;
+        if (SP_DEBUG) {
+                /* unused part of t2, may result in spurious
+                 * differences in debug output. Clear it.
+                 */
+                memset(t2, 0, sizeof(t2));
        }
        /* T1 = Z * Z */
-        sp_256_mont_sqr_10(t1, r->z, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_8(t1, r->z /*, p256_mod, p256_mp_mod*/);
        /* Z = Y * Z */
-        sp_256_mont_mul_10(r->z, r->y, r->z, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(r->z, r->y, r->z /*, p256_mod, p256_mp_mod*/);
        /* Z = 2Z */
-        sp_256_mont_dbl_10(r->z, r->z, p256_mod);
+        sp_256_mont_dbl_8(r->z, r->z /*, p256_mod*/);
        /* T2 = X - T1 */
-        sp_256_mont_sub_10(t2, r->x, t1, p256_mod);
+        sp_256_mont_sub_8(t2, r->x, t1 /*, p256_mod*/);
        /* T1 = X + T1 */
-        sp_256_mont_add_10(t1, r->x, t1, p256_mod);
+        sp_256_mont_add_8(t1, r->x, t1 /*, p256_mod*/);
        /* T2 = T1 * T2 */
-        sp_256_mont_mul_10(t2, t1, t2, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(t2, t1, t2 /*, p256_mod, p256_mp_mod*/);
        /* T1 = 3T2 */
-        sp_256_mont_tpl_10(t1, t2, p256_mod);
+        sp_256_mont_tpl_8(t1, t2 /*, p256_mod*/);
        /* Y = 2Y */
-        sp_256_mont_dbl_10(r->y, r->y, p256_mod);
+        sp_256_mont_dbl_8(r->y, r->y /*, p256_mod*/);
        /* Y = Y * Y */
-        sp_256_mont_sqr_10(r->y, r->y, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_8(r->y, r->y /*, p256_mod, p256_mp_mod*/);
        /* T2 = Y * Y */
-        sp_256_mont_sqr_10(t2, r->y, p256_mod, p256_mp_mod);
+        sp_256_mont_sqr_8(t2, r->y /*, p256_mod, p256_mp_mod*/);
        /* T2 = T2/2 */
-        sp_256_div2_10(t2, t2, p256_mod);
+        sp_256_div2_8(t2, t2, p256_mod);
        /* Y = Y * X */
-        sp_256_mont_mul_10(r->y, r->y, r->x, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(r->y, r->y, r->x /*, p256_mod, p256_mp_mod*/);
        /* X = T1 * T1 */
-        sp_256_mont_mul_10(r->x, t1, t1, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(r->x, t1, t1 /*, p256_mod, p256_mp_mod*/);
        /* X = X - Y */
-        sp_256_mont_sub_10(r->x, r->x, r->y, p256_mod);
+        sp_256_mont_sub_8(r->x, r->x, r->y /*, p256_mod*/);
        /* X = X - Y */
-        sp_256_mont_sub_10(r->x, r->x, r->y, p256_mod);
+        sp_256_mont_sub_8(r->x, r->x, r->y /*, p256_mod*/);
        /* Y = Y - X */
-        sp_256_mont_sub_10(r->y, r->y, r->x, p256_mod);
+        sp_256_mont_sub_8(r->y, r->y, r->x /*, p256_mod*/);
        /* Y = Y * T1 */
-        sp_256_mont_mul_10(r->y, r->y, t1, p256_mod, p256_mp_mod);
+        sp_256_mont_mul_8(r->y, r->y, t1 /*, p256_mod, p256_mp_mod*/);
        /* Y = Y - T2 */
-        sp_256_mont_sub_10(r->y, r->y, t2, p256_mod);
+        sp_256_mont_sub_8(r->y, r->y, t2 /*, p256_mod*/);
+        dump_512("y2 %s\n", r->y);
 }
 /* Add two Montgomery form projective points.
@@ -677,13 +1162,13 @@ static void sp_256_proj_point_dbl_10(sp_point* r, sp_point* p)
 * p  First point to add.
 * q  Second point to add.
 */
-static void sp_256_proj_point_add_10(sp_point* r, sp_point* p, sp_point* q)
+static NOINLINE void sp_256_proj_point_add_8(sp_point* r, sp_point* p, sp_point* q)
 {
-        sp_digit t1[2*10];
+        sp_digit t1[2*8];
-        sp_digit t2[2*10];
+        sp_digit t2[2*8];
-        sp_digit t3[2*10];
+        sp_digit t3[2*8];
-        sp_digit t4[2*10];
+        sp_digit t4[2*8];
-        sp_digit t5[2*10];
+        sp_digit t5[2*8];
        /* Ensure only the first point is the same as the result. */
        if (q == r) {
@@ -693,13 +1178,13 @@ static void sp_256_proj_point_add_10(sp_point* r, sp_point* p, sp_point* q)
        }
        /* Check double */
-        sp_256_sub_10(t1, p256_mod, q->y);
+        sp_256_sub_8(t1, p256_mod, q->y);
-        sp_256_norm_10(t1);
+        sp_256_norm_8(t1);
-        if (sp_256_cmp_equal_10(p->x, q->x)
+        if (sp_256_cmp_equal_8(p->x, q->x)
-         && sp_256_cmp_equal_10(p->z, q->z)
+         && sp_256_cmp_equal_8(p->z, q->z)
-         && (sp_256_cmp_equal_10(p->y, q->y) || sp_256_cmp_equal_10(p->y, t1))
+         && (sp_256_cmp_equal_8(p->y, q->y) || sp_256_cmp_equal_8(p->y, t1))
        ) {
-                sp_256_proj_point_dbl_10(r, p);
+                sp_256_proj_point_dbl_8(r, p);
        }
        else {
                sp_point tp;
@@ -714,37 +1199,37 @@ static void sp_256_proj_point_add_10(sp_point* r, sp_point* p, sp_point* q)
                *r = p->infinity ? *q : *p; /* struct copy */
                /* U1 = X1*Z2^2 */
-                sp_256_mont_sqr_10(t1, q->z, p256_mod, p256_mp_mod);
+                sp_256_mont_sqr_8(t1, q->z /*, p256_mod, p256_mp_mod*/);
-                sp_256_mont_mul_10(t3, t1, q->z, p256_mod, p256_mp_mod);
+                sp_256_mont_mul_8(t3, t1, q->z /*, p256_mod, p256_mp_mod*/);
-                sp_256_mont_mul_10(t1, t1, v->x, p256_mod, p256_mp_mod);
+                sp_256_mont_mul_8(t1, t1, v->x /*, p256_mod, p256_mp_mod*/);
                /* U2 = X2*Z1^2 */
-                sp_256_mont_sqr_10(t2, v->z, p256_mod, p256_mp_mod);
+                sp_256_mont_sqr_8(t2, v->z /*, p256_mod, p256_mp_mod*/);
-                sp_256_mont_mul_10(t4, t2, v->z, p256_mod, p256_mp_mod);
+                sp_256_mont_mul_8(t4, t2, v->z /*, p256_mod, p256_mp_mod*/);
-                sp_256_mont_mul_10(t2, t2, q->x, p256_mod, p256_mp_mod);
+                sp_256_mont_mul_8(t2, t2, q->x /*, p256_mod, p256_mp_mod*/);
                /* S1 = Y1*Z2^3 */
-                sp_256_mont_mul_10(t3, t3, v->y, p256_mod, p256_mp_mod);
+                sp_256_mont_mul_8(t3, t3, v->y /*, p256_mod, p256_mp_mod*/);
                /* S2 = Y2*Z1^3 */
-                sp_256_mont_mul_10(t4, t4, q->y, p256_mod, p256_mp_mod);
+                sp_256_mont_mul_8(t4, t4, q->y /*, p256_mod, p256_mp_mod*/);
                /* H = U2 - U1 */
-                sp_256_mont_sub_10(t2, t2, t1, p256_mod);
+                sp_256_mont_sub_8(t2, t2, t1 /*, p256_mod*/);
                /* R = S2 - S1 */
-                sp_256_mont_sub_10(t4, t4, t3, p256_mod);
+                sp_256_mont_sub_8(t4, t4, t3 /*, p256_mod*/);
                /* Z3 = H*Z1*Z2 */
-                sp_256_mont_mul_10(v->z, v->z, q->z, p256_mod, p256_mp_mod);
+                sp_256_mont_mul_8(v->z, v->z, q->z /*, p256_mod, p256_mp_mod*/);
-                sp_256_mont_mul_10(v->z, v->z, t2, p256_mod, p256_mp_mod);
+                sp_256_mont_mul_8(v->z, v->z, t2 /*, p256_mod, p256_mp_mod*/);
                /* X3 = R^2 - H^3 - 2*U1*H^2 */
-                sp_256_mont_sqr_10(v->x, t4, p256_mod, p256_mp_mod);
+                sp_256_mont_sqr_8(v->x, t4 /*, p256_mod, p256_mp_mod*/);
-                sp_256_mont_sqr_10(t5, t2, p256_mod, p256_mp_mod);
+                sp_256_mont_sqr_8(t5, t2 /*, p256_mod, p256_mp_mod*/);
-                sp_256_mont_mul_10(v->y, t1, t5, p256_mod, p256_mp_mod);
+                sp_256_mont_mul_8(v->y, t1, t5 /*, p256_mod, p256_mp_mod*/);
-                sp_256_mont_mul_10(t5, t5, t2, p256_mod, p256_mp_mod);
+                sp_256_mont_mul_8(t5, t5, t2 /*, p256_mod, p256_mp_mod*/);
-                sp_256_mont_sub_10(v->x, v->x, t5, p256_mod);
+                sp_256_mont_sub_8(v->x, v->x, t5 /*, p256_mod*/);
-                sp_256_mont_dbl_10(t1, v->y, p256_mod);
+                sp_256_mont_dbl_8(t1, v->y /*, p256_mod*/);
-                sp_256_mont_sub_10(v->x, v->x, t1, p256_mod);
+                sp_256_mont_sub_8(v->x, v->x, t1 /*, p256_mod*/);
                /* Y3 = R*(U1*H^2 - X3) - S1*H^3 */
-                sp_256_mont_sub_10(v->y, v->y, v->x, p256_mod);
+                sp_256_mont_sub_8(v->y, v->y, v->x /*, p256_mod*/);
-                sp_256_mont_mul_10(v->y, v->y, t4, p256_mod, p256_mp_mod);
+                sp_256_mont_mul_8(v->y, v->y, t4 /*, p256_mod, p256_mp_mod*/);
-                sp_256_mont_mul_10(t5, t5, t3, p256_mod, p256_mp_mod);
+                sp_256_mont_mul_8(t5, t5, t3 /*, p256_mod, p256_mp_mod*/);
-                sp_256_mont_sub_10(v->y, v->y, t5, p256_mod);
+                sp_256_mont_sub_8(v->y, v->y, t5 /*, p256_mod*/);
        }
 }
@@ -756,12 +1241,11 @@ static void sp_256_proj_point_add_10(sp_point* r, sp_point* p, sp_point* q)
 * k     Scalar to multiply by.
 * map   Indicates whether to convert result to affine.
 */
-static void sp_256_ecc_mulmod_10(sp_point* r, const sp_point* g, const sp_digit* k /*, int map*/)
+static void sp_256_ecc_mulmod_8(sp_point* r, const sp_point* g, const sp_digit* k /*, int map*/)
 {
        enum { map = 1 }; /* we always convert result to affine coordinates */
        sp_point t[3];
-        sp_digit n;
+        sp_digit n = n; /* for compiler */
-        int i;
        int c, y;
        memset(t, 0, sizeof(t));
@@ -769,33 +1253,44 @@ static void sp_256_ecc_mulmod_10(sp_point* r, const sp_point* g, const sp_digit*
        /* t[0] = {0, 0, 1} * norm */
        t[0].infinity = 1;
        /* t[1] = {g->x, g->y, g->z} * norm */
-        sp_256_mod_mul_norm_10(t[1].x, g->x);
+        sp_256_mod_mul_norm_8(t[1].x, g->x);
-        sp_256_mod_mul_norm_10(t[1].y, g->y);
+        sp_256_mod_mul_norm_8(t[1].y, g->y);
-        sp_256_mod_mul_norm_10(t[1].z, g->z);
+        sp_256_mod_mul_norm_8(t[1].z, g->z);
-        i = 9;
-        c = 22;
-        n = k[i--] << (26 - c);
-        for (; ; c--) {
-                if (c == 0) {
-                        if (i == -1)
-                                break;
-                        n = k[i--];
+        /* For every bit, starting from most significant... */
-                        c = 26;
+        k += 7;
+        c = 256;
+        for (;;) {
+                if ((c & 0x1f) == 0) {
+                        if (c == 0)
+                                break;
+                        n = *k--;
                }
-                y = (n >> 25) & 1;
+                y = (n >> 31);
-                n <<= 1;
+                dbg("y:%d t[%d] = t[0]+t[1]\n", y, y^1);
+                sp_256_proj_point_add_8(&t[y^1], &t[0], &t[1]);
-                sp_256_proj_point_add_10(&t[y^1], &t[0], &t[1]);
+                dump_512("t[0].x %s\n", t[0].x);
+                dump_512("t[0].y %s\n", t[0].y);
+                dump_512("t[0].z %s\n", t[0].z);
+                dump_512("t[1].x %s\n", t[1].x);
+                dump_512("t[1].y %s\n", t[1].y);
+                dump_512("t[1].z %s\n", t[1].z);
+                dbg("t[2] = t[%d]\n", y);
                memcpy(&t[2], &t[y], sizeof(sp_point));
-                sp_256_proj_point_dbl_10(&t[2], &t[2]);
+                dbg("t[2] *= 2\n");
+                sp_256_proj_point_dbl_8(&t[2], &t[2]);
+                dump_512("t[2].x %s\n", t[2].x);
+                dump_512("t[2].y %s\n", t[2].y);
+                dump_512("t[2].z %s\n", t[2].z);
                memcpy(&t[y], &t[2], sizeof(sp_point));
+                n <<= 1;
+                c--;
        }
        if (map)
-                sp_256_map_10(r, &t[0]);
+                sp_256_map_8(r, &t[0]);
        else
                memcpy(r, &t[0], sizeof(sp_point));
@@ -809,7 +1304,7 @@ static void sp_256_ecc_mulmod_10(sp_point* r, const sp_point* g, const sp_digit*
 * k     Scalar to multiply by.
 * map   Indicates whether to convert result to affine.
 */
-static void sp_256_ecc_mulmod_base_10(sp_point* r, sp_digit* k /*, int map*/)
+static void sp_256_ecc_mulmod_base_8(sp_point* r, sp_digit* k /*, int map*/)
 {
        /* Since this function is called only once, save space:
         * don't have "static const sp_point p256_base = {...}",
@@ -826,7 +1321,7 @@ static void sp_256_ecc_mulmod_base_10(sp_point* r, sp_digit* k /*, int map*/)
        sp_256_point_from_bin2x32(&p256_base, p256_base_bin);
-        sp_256_ecc_mulmod_10(r, &p256_base, k /*, map*/);
+        sp_256_ecc_mulmod_8(r, &p256_base, k /*, map*/);
 }
 /* Multiply the point by the scalar and serialize the X ordinate.
@@ -836,7 +1331,7 @@ static void sp_256_ecc_mulmod_base_10(sp_point* r, sp_digit* k /*, int map*/)
 * pub2x32 Point to multiply.
 * out32   Buffer to hold X ordinate.
 */
-static void sp_ecc_secret_gen_256(const sp_digit priv[10], const uint8_t *pub2x32, uint8_t* out32)
+static void sp_ecc_secret_gen_256(const sp_digit priv[8], const uint8_t *pub2x32, uint8_t* out32)
 {
        sp_point point[1];
@@ -847,66 +1342,51 @@ static void sp_ecc_secret_gen_256(const sp_digit priv[10], const uint8_t *pub2x3
        dump_hex("        %s\n", pub2x32 + 32, 32);
        sp_256_point_from_bin2x32(point, pub2x32);
-        dump_hex("point->x %s\n", point->x, sizeof(point->x));
+        dump_512("point->x %s\n", point->x);
-        dump_hex("point->y %s\n", point->y, sizeof(point->y));
+        dump_512("point->y %s\n", point->y);
-        sp_256_ecc_mulmod_10(point, point, priv);
+        sp_256_ecc_mulmod_8(point, point, priv);
-        sp_256_to_bin(point->x, out32);
+        sp_256_to_bin_8(point->x, out32);
        dump_hex("out32: %s\n", out32, 32);
 }
-/* Generates a scalar that is in the range 1..order-1. */
+/* Generates a random scalar in [1..order-1] range. */
-#define SIMPLIFY 1
+static void sp_256_ecc_gen_k_8(sp_digit k[8])
-/* Add 1 to a. (a = a + 1) */
-static void sp_256_add_one_10(sp_digit* a)
 {
-        a[0]++;
+        /* Since 32-bit words are "dense", no need to use
-        sp_256_norm_10(a);
+         * sp_256_from_bin_8(k, buf) to convert random stream
-}
+         * to sp_digit array - just store random bits there directly.
-static void sp_256_ecc_gen_k_10(sp_digit k[10])
+         */
-{
+        tls_get_random(k, 8 * sizeof(k[0]));
-#if !SIMPLIFY
-        /* The order of the curve P256 minus 2. */
-        static const sp_digit p256_order2[10] = {
-                0x063254f,0x272b0bf,0x1e84f3b,0x2b69c5e,0x3bce6fa,
-                0x3ffffff,0x3ffffff,0x00003ff,0x3ff0000,0x03fffff,
-        };
-#endif
-        uint8_t buf[32];
-        for (;;) {
-                tls_get_random(buf, sizeof(buf));
 #if FIXED_SECRET
-                memset(buf, 0x77, sizeof(buf));
+        memset(k, 0x77, 8 * sizeof(k[0]));
-#endif
-                sp_256_from_bin(k, 10, buf, sizeof(buf));
-#if !SIMPLIFY
-                if (sp_256_cmp_10(k, p256_order2) < 0)
-                        break;
-#else
-                /* non-loopy version (and not needing p256_order2[]):
-                 * if most-significant word seems that k can be larger
-                 * than p256_order2, fix it up:
-                 */
-                if (k[9] >= 0x03fffff)
-                        k[9] = 0x03ffffe;
-                break;
 #endif
-        }
-        sp_256_add_one_10(k);
+// If scalar is too large, try again (pseudo-code)
-#undef SIMPLIFY
+//      if (k >= 0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551 - 1) // order of P256
+//              goto pick_another_random;
+//      k++; // ensure non-zero
+        /* Simpler alternative, at the cost of not choosing some valid
+         * random values, and slightly non-uniform distribution */
+        if (k[0] == 0)
+                k[0] = 1;
+        if (k[7] >= 0xffffffff)
+                k[7] = 0xfffffffe;
 }
 /* Makes a random EC key pair. */
-static void sp_ecc_make_key_256(sp_digit privkey[10], uint8_t *pubkey)
+static void sp_ecc_make_key_256(sp_digit privkey[8], uint8_t *pubkey)
 {
        sp_point point[1];
-        sp_256_ecc_gen_k_10(privkey);
+        sp_256_ecc_gen_k_8(privkey);
-        sp_256_ecc_mulmod_base_10(point, privkey);
+        dump_256("privkey %s\n", privkey);
-        sp_256_to_bin(point->x, pubkey);
+        sp_256_ecc_mulmod_base_8(point, privkey);
-        sp_256_to_bin(point->y, pubkey + 32);
+        dump_512("point->x %s\n", point->x);
+        dump_512("point->y %s\n", point->y);
+        sp_256_to_bin_8(point->x, pubkey);
+        sp_256_to_bin_8(point->y, pubkey + 32);
        memset(point, 0, sizeof(point)); //paranoia
 }
@@ -915,8 +1395,9 @@ void FAST_FUNC curve_P256_compute_pubkey_and_premaster(
                uint8_t *pubkey2x32, uint8_t *premaster32,
                const uint8_t *peerkey2x32)
 {
-        sp_digit privkey[10];
+        sp_digit privkey[8];
+        dump_hex("peerkey2x32: %s\n", peerkey2x32, 64);
        sp_ecc_make_key_256(privkey, pubkey2x32);
        dump_hex("pubkey: %s\n", pubkey2x32, 32);
        dump_hex("        %s\n", pubkey2x32 + 32, 32);
diff --git a/networking/tls_symmetric.h b/networking/tls_symmetric.h
deleted file mode 100644
index 5e0e4b6d8..000000000
--- a/networking/tls_symmetric.h
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
- * Copyright (C) 2017 Denys Vlasenko
- *
- * Licensed under GPLv2, see file LICENSE in this source tree.
- */
-/* The part below is a section of matrixssl-3-7-2b-open/crypto/cryptolib.h
- * Changes are flagged with //bbox
- */
-/******************************************************************************/
-/* 32-bit Rotates */
-/******************************************************************************/
-#if defined(_MSC_VER)
-/******************************************************************************/
-/* instrinsic rotate */
-#include <stdlib.h>
-#pragma intrinsic(_lrotr,_lrotl)
-#define ROR(x,n) _lrotr(x,n)
-#define ROL(x,n) _lrotl(x,n)
-/******************************************************************************/
-#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && \
-                !defined(INTEL_CC) && !defined(PS_NO_ASM)
-static ALWAYS_INLINE unsigned ROL(unsigned word, int i)
-{
- if (__builtin_constant_p(i)) { //box
-   // Rotates by constant use fewer registers,
-   // and on many Intel CPUs rotates by %cl take 2 cycles, not 1.
-   asm ("roll %2,%0"
-          :"=r" (word)
-          :"0" (word),"i" (i));
-   return word;
- } //box
- asm ("roll %%cl,%0"
-          :"=r" (word)
-          :"0" (word),"c" (i));
- return word;
-}
-static ALWAYS_INLINE unsigned ROR(unsigned word, int i)
-{
- if (__builtin_constant_p(i)) { //box
-   asm ("rorl %2,%0"
-          :"=r" (word)
-          :"0" (word),"i" (i));
-   return word;
- } //box
- asm ("rorl %%cl,%0"
-          :"=r" (word)
-          :"0" (word),"c" (i));
- return word;
-}
-/******************************************************************************/
-#else
-/* rotates the hard way */
-#define ROL(x, y) \
-        ( (((unsigned long)(x)<<(unsigned long)((y)&31)) | \
-        (((unsigned long)(x)&0xFFFFFFFFUL)>>(unsigned long)(32-((y)&31)))) & \
-        0xFFFFFFFFUL)
-#define ROR(x, y) \
-        ( ((((unsigned long)(x)&0xFFFFFFFFUL)>>(unsigned long)((y)&31)) | \
-        ((unsigned long)(x)<<(unsigned long)(32-((y)&31)))) & 0xFFFFFFFFUL)
-#endif /* 32-bit Rotates */
-/******************************************************************************/
-#ifdef HAVE_NATIVE_INT64
-#ifdef _MSC_VER
-        #define CONST64(n) n ## ui64
-#else
-        #define CONST64(n) n ## ULL
-#endif
-#endif
-/******************************************************************************/
-/*
-        Endian helper macros
- */
-#if defined (ENDIAN_NEUTRAL)
-#define STORE32L(x, y) { \
-(y)[3] = (unsigned char)(((x)>>24)&255); \
-(y)[2] = (unsigned char)(((x)>>16)&255);  \
-(y)[1] = (unsigned char)(((x)>>8)&255); \
-(y)[0] = (unsigned char)((x)&255); \
-}
-#define LOAD32L(x, y) { \
-x = ((unsigned long)((y)[3] & 255)<<24) | \
-((unsigned long)((y)[2] & 255)<<16) | \
-((unsigned long)((y)[1] & 255)<<8)  | \
-((unsigned long)((y)[0] & 255)); \
-}
-#define STORE64L(x, y) { \
-(y)[7] = (unsigned char)(((x)>>56)&255); \
-(y)[6] = (unsigned char)(((x)>>48)&255); \
-(y)[5] = (unsigned char)(((x)>>40)&255); \
-(y)[4] = (unsigned char)(((x)>>32)&255); \
-(y)[3] = (unsigned char)(((x)>>24)&255); \
-(y)[2] = (unsigned char)(((x)>>16)&255); \
-(y)[1] = (unsigned char)(((x)>>8)&255); \
-(y)[0] = (unsigned char)((x)&255); \
-}
-#define LOAD64L(x, y) { \
-x = (((uint64)((y)[7] & 255))<<56)|(((uint64)((y)[6] & 255))<<48)| \
-(((uint64)((y)[5] & 255))<<40)|(((uint64)((y)[4] & 255))<<32)| \
-(((uint64)((y)[3] & 255))<<24)|(((uint64)((y)[2] & 255))<<16)| \
-(((uint64)((y)[1] & 255))<<8)|(((uint64)((y)[0] & 255))); \
-}
-#define STORE32H(x, y) { \
-(y)[0] = (unsigned char)(((x)>>24)&255); \
-(y)[1] = (unsigned char)(((x)>>16)&255); \
-(y)[2] = (unsigned char)(((x)>>8)&255); \
-(y)[3] = (unsigned char)((x)&255); \
-}
-#define LOAD32H(x, y) { \
-x = ((unsigned long)((y)[0] & 255)<<24) | \
-((unsigned long)((y)[1] & 255)<<16) | \
-((unsigned long)((y)[2] & 255)<<8)  | \
-((unsigned long)((y)[3] & 255)); \
-}
-#define STORE64H(x, y) { \
-(y)[0] = (unsigned char)(((x)>>56)&255); \
-(y)[1] = (unsigned char)(((x)>>48)&255); \
-(y)[2] = (unsigned char)(((x)>>40)&255); \
-(y)[3] = (unsigned char)(((x)>>32)&255); \
-(y)[4] = (unsigned char)(((x)>>24)&255); \
-(y)[5] = (unsigned char)(((x)>>16)&255); \
-(y)[6] = (unsigned char)(((x)>>8)&255); \
-(y)[7] = (unsigned char)((x)&255); \
-}
-#define LOAD64H(x, y) { \
-x = (((uint64)((y)[0] & 255))<<56)|(((uint64)((y)[1] & 255))<<48) | \
-(((uint64)((y)[2] & 255))<<40)|(((uint64)((y)[3] & 255))<<32) | \
-(((uint64)((y)[4] & 255))<<24)|(((uint64)((y)[5] & 255))<<16) | \
-(((uint64)((y)[6] & 255))<<8)|(((uint64)((y)[7] & 255))); \
-}
-#endif /* ENDIAN_NEUTRAL */
-#ifdef ENDIAN_LITTLE
-#define STORE32H(x, y) { \
-(y)[0] = (unsigned char)(((x)>>24)&255); \
-(y)[1] = (unsigned char)(((x)>>16)&255); \
-(y)[2] = (unsigned char)(((x)>>8)&255); \
-(y)[3] = (unsigned char)((x)&255); \
-}
-#define LOAD32H(x, y) { \
-x = ((unsigned long)((y)[0] & 255)<<24) | \
-((unsigned long)((y)[1] & 255)<<16) | \
-((unsigned long)((y)[2] & 255)<<8)  | \
-((unsigned long)((y)[3] & 255)); \
-}
-#define STORE64H(x, y) { \
-(y)[0] = (unsigned char)(((x)>>56)&255); \
-(y)[1] = (unsigned char)(((x)>>48)&255); \
-(y)[2] = (unsigned char)(((x)>>40)&255); \
-(y)[3] = (unsigned char)(((x)>>32)&255); \
-(y)[4] = (unsigned char)(((x)>>24)&255); \
-(y)[5] = (unsigned char)(((x)>>16)&255); \
-(y)[6] = (unsigned char)(((x)>>8)&255); \
-(y)[7] = (unsigned char)((x)&255); \
-}
-#define LOAD64H(x, y) { \
-x = (((uint64)((y)[0] & 255))<<56)|(((uint64)((y)[1] & 255))<<48) | \
-(((uint64)((y)[2] & 255))<<40)|(((uint64)((y)[3] & 255))<<32) | \
-(((uint64)((y)[4] & 255))<<24)|(((uint64)((y)[5] & 255))<<16) | \
-(((uint64)((y)[6] & 255))<<8)|(((uint64)((y)[7] & 255))); }
-#ifdef ENDIAN_32BITWORD
-#define STORE32L(x, y) { \
-unsigned long __t = (x); memcpy(y, &__t, 4); \
-}
-#define LOAD32L(x, y)  memcpy(&(x), y, 4);
-#define STORE64L(x, y) { \
-(y)[7] = (unsigned char)(((x)>>56)&255); \
-(y)[6] = (unsigned char)(((x)>>48)&255); \
-(y)[5] = (unsigned char)(((x)>>40)&255); \
-(y)[4] = (unsigned char)(((x)>>32)&255); \
-(y)[3] = (unsigned char)(((x)>>24)&255); \
-(y)[2] = (unsigned char)(((x)>>16)&255); \
-(y)[1] = (unsigned char)(((x)>>8)&255); \
-(y)[0] = (unsigned char)((x)&255); \
-}
-#define LOAD64L(x, y) { \
-x = (((uint64)((y)[7] & 255))<<56)|(((uint64)((y)[6] & 255))<<48)| \
-(((uint64)((y)[5] & 255))<<40)|(((uint64)((y)[4] & 255))<<32)| \
-(((uint64)((y)[3] & 255))<<24)|(((uint64)((y)[2] & 255))<<16)| \
-(((uint64)((y)[1] & 255))<<8)|(((uint64)((y)[0] & 255))); \
-}
-#else /* 64-bit words then  */
-#define STORE32L(x, y) \
-{ unsigned long __t = (x); memcpy(y, &__t, 4); }
-#define LOAD32L(x, y) \
-{ memcpy(&(x), y, 4); x &= 0xFFFFFFFF; }
-#define STORE64L(x, y) \
-{ uint64 __t = (x); memcpy(y, &__t, 8); }
-#define LOAD64L(x, y) \
-{ memcpy(&(x), y, 8); }
-#endif /* ENDIAN_64BITWORD */
-#endif /* ENDIAN_LITTLE */
-#ifdef ENDIAN_BIG
-#define STORE32L(x, y) { \
-(y)[3] = (unsigned char)(((x)>>24)&255); \
-(y)[2] = (unsigned char)(((x)>>16)&255); \
-(y)[1] = (unsigned char)(((x)>>8)&255); \
-(y)[0] = (unsigned char)((x)&255); \
-}
-#define LOAD32L(x, y) { \
-x = ((unsigned long)((y)[3] & 255)<<24) | \
-((unsigned long)((y)[2] & 255)<<16) | \
-((unsigned long)((y)[1] & 255)<<8)  | \
-((unsigned long)((y)[0] & 255)); \
-}
-#define STORE64L(x, y) { \
-(y)[7] = (unsigned char)(((x)>>56)&255); \
-(y)[6] = (unsigned char)(((x)>>48)&255); \
-(y)[5] = (unsigned char)(((x)>>40)&255); \
-(y)[4] = (unsigned char)(((x)>>32)&255); \
-(y)[3] = (unsigned char)(((x)>>24)&255); \
-(y)[2] = (unsigned char)(((x)>>16)&255); \
-(y)[1] = (unsigned char)(((x)>>8)&255); \
-(y)[0] = (unsigned char)((x)&255); \
-}
-#define LOAD64L(x, y) { \
-x = (((uint64)((y)[7] & 255))<<56)|(((uint64)((y)[6] & 255))<<48) | \
-(((uint64)((y)[5] & 255))<<40)|(((uint64)((y)[4] & 255))<<32) | \
-(((uint64)((y)[3] & 255))<<24)|(((uint64)((y)[2] & 255))<<16) | \
-(((uint64)((y)[1] & 255))<<8)|(((uint64)((y)[0] & 255))); \
-}
-#ifdef ENDIAN_32BITWORD
-#define STORE32H(x, y) \
-{ unsigned int __t = (x); memcpy(y, &__t, 4); }
-#define LOAD32H(x, y) memcpy(&(x), y, 4);
-#define STORE64H(x, y) { \
-(y)[0] = (unsigned char)(((x)>>56)&255); \
-(y)[1] = (unsigned char)(((x)>>48)&255); \
-(y)[2] = (unsigned char)(((x)>>40)&255); \
-(y)[3] = (unsigned char)(((x)>>32)&255); \
-(y)[4] = (unsigned char)(((x)>>24)&255); \
-(y)[5] = (unsigned char)(((x)>>16)&255); \
-(y)[6] = (unsigned char)(((x)>>8)&255); \
-(y)[7] = (unsigned char)((x)&255); \
-}
-#define LOAD64H(x, y) { \
-x = (((uint64)((y)[0] & 255))<<56)|(((uint64)((y)[1] & 255))<<48)| \
-(((uint64)((y)[2] & 255))<<40)|(((uint64)((y)[3] & 255))<<32)| \
-(((uint64)((y)[4] & 255))<<24)|(((uint64)((y)[5] & 255))<<16)| \
-(((uint64)((y)[6] & 255))<<8)| (((uint64)((y)[7] & 255))); \
-}
-#else /* 64-bit words then  */
-#define STORE32H(x, y) \
-{ unsigned long __t = (x); memcpy(y, &__t, 4); }
-#define LOAD32H(x, y) \
-{ memcpy(&(x), y, 4); x &= 0xFFFFFFFF; }
-#define STORE64H(x, y) \
-{ uint64 __t = (x); memcpy(y, &__t, 8); }
-#define LOAD64H(x, y) \
-{ memcpy(&(x), y, 8); }
-#endif /* ENDIAN_64BITWORD */
-#endif /* ENDIAN_BIG */
-#ifdef HAVE_NATIVE_INT64
-#define ROL64c(x, y) \
-( (((x)<<((uint64)(y)&63)) | \
-(((x)&CONST64(0xFFFFFFFFFFFFFFFF))>>((uint64)64-((y)&63)))) & CONST64(0xFFFFFFFFFFFFFFFF))
-#define ROR64c(x, y) \
-( ((((x)&CONST64(0xFFFFFFFFFFFFFFFF))>>((uint64)(y)&CONST64(63))) | \
-((x)<<((uint64)(64-((y)&CONST64(63)))))) & CONST64(0xFFFFFFFFFFFFFFFF))
-#endif /* HAVE_NATIVE_INT64 */
-/******************************************************************************/
-/* The part below is taken almost verbatim from matrixssl-3-7-2b-open/crypto/symmetric/.
- * Changes are flagged with //bbox
- */
-/**
- *      @file    symmetric.h
- *      @version 33ef80f (HEAD, tag: MATRIXSSL-3-7-2-OPEN, tag: MATRIXSSL-3-7-2-COMM, origin/master, origin/HEAD, master)
- *
- *      Header for internal symmetric key cryptography support.
- */
-/*
- *      Copyright (c) 2013-2015 INSIDE Secure Corporation
- *      Copyright (c) PeerSec Networks, 2002-2011
- *      All Rights Reserved
- *
- *      The latest version of this code is available at http://www.matrixssl.org
- *
- *      This software is open source; you can redistribute it and/or modify
- *      it under the terms of the GNU General Public License as published by
- *      the Free Software Foundation; either version 2 of the License, or
- *      (at your option) any later version.
- *
- *      This General Public License does NOT permit incorporating this software
- *      into proprietary programs.  If you are unable to comply with the GPL, a
- *      commercial license for this software may be purchased from INSIDE at
- *      http://www.insidesecure.com/eng/Company/Locations
- *
- *      This program is distributed in WITHOUT ANY WARRANTY; without even the
- *      implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *      See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License
- *      along with this program; if not, write to the Free Software
- *      Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- *      http://www.gnu.org/copyleft/gpl.html
- */
-/******************************************************************************/
-#ifndef _h_PS_SYMMETRIC
-#define _h_PS_SYMMETRIC
-/******************************************************************************/
-#ifdef USE_AES
-/******************************************************************************/
-#ifndef USE_AES_CBC_EXTERNAL
-typedef struct {
-        uint32 eK[64], dK[64];
-        int32 Nr;
-} psAesKey_t;
-typedef struct {
-        int32                   blocklen;
-        unsigned char   IV[16];
-        psAesKey_t              key;
-#if defined(USE_AES_GCM) || defined(USE_AES_CCM)
-        unsigned char   EncCtr[16];
-        unsigned char   CtrBlock[16];
-#endif
-#ifdef USE_AES_GCM
-        unsigned char   gInit[16];
-        uint32                  TagTemp[4];
-        unsigned char   Hash_SubKey[16];
-        uint32                  ProcessedBitCount[4];
-        uint32                  InputBufferCount;
-        uint32                  OutputBufferCount;
-        union
-        {
-                unsigned char Buffer[128];
-                uint32 BufferAlignment;
-        } Input;
-#endif /* USE_AES_GCM */
-#ifdef USE_AES_CCM
-        uint32_t ccmTagTemp[16 / sizeof(uint32_t)]; /* 32 */
-        union
-        {
-                /* Used for formatting IV. */
-                uint8_t Temporary[16];
-                /* Used for processing Mac. */
-                uint8_t Y0[16];
-        } u; /* 48 */
-#endif /* USE_AES_CCM */
-} psAesCipher_t;
-#endif /* USE_AES_CBC_EXTERNAL */
-#endif /* USE_AES */
-#ifdef USE_IDEA
-#define SSL_IDEA_KEY_LEN        16
-#define SSL_IDEA_IV_LEN         8
-#define SSL_IDEA_BLOCK_LEN      8
-typedef struct {
-        uint16  key_schedule[52];
-} psIdeaKey_t;
-typedef struct {
-        psIdeaKey_t             key;
-        uint32                  IV[2];
-        short                   for_encryption;
-        short                   inverted;
-} idea_CBC;
-#endif
-/******************************************************************************/
-/******************************************************************************/
-#ifdef USE_SEED
-/******************************************************************************/
-#define SSL_SEED_KEY_LEN        16
-#define SSL_SEED_IV_LEN         16
-typedef struct {
-        uint32 K[32], dK[32];
-} psSeedKey_t;
-typedef struct {
-        int32                   blocklen;
-        unsigned char   IV[16];
-        psSeedKey_t             key;
-} seed_CBC;
-#endif /* USE_SEED */
-/******************************************************************************/
-/******************************************************************************/
-#if defined(USE_3DES) || defined(USE_DES)
-/******************************************************************************/
-#define DES3_KEY_LEN    24
-#define DES3_IV_LEN             8
-#define DES_KEY_LEN             8
-typedef struct {
-        uint32 ek[3][32], dk[3][32];
-} psDes3Key_t;
-/*
-        A block cipher CBC structure
- */
-typedef struct {
-        int32                           blocklen;
-        unsigned char           IV[8];
-        psDes3Key_t                     key;
-} des3_CBC;
-#endif /* USE_3DES || USE_DES */
-/******************************************************************************/
-/******************************************************************************/
-#ifdef USE_ARC4
-typedef struct {
-        unsigned char   state[256];
-        uint32  byteCount;
-        unsigned char   x;
-        unsigned char   y;
-} psRc4Key_t;
-#endif /* USE_ARC4 */
-/******************************************************************************/
-#ifdef USE_RC2
-typedef struct {
-        unsigned xkey[64];
-} psRc2Key_t;
-typedef struct {
-        int32                           blocklen;
-        unsigned char           IV[8];
-        psRc2Key_t                      key;
-} rc2_CBC;
-#endif /* USE_RC2 */
-/******************************************************************************/
-/*      Universal types and defines */
-/******************************************************************************/
-#define MAXBLOCKSIZE    24
-typedef union {
-#ifdef USE_RC2
-        rc2_CBC         rc2;
-#endif
-#ifdef USE_ARC4
-        psRc4Key_t      arc4;
-#endif
-#ifdef USE_3DES
-        des3_CBC        des3;
-#endif
-#ifdef USE_AES
-        psAesCipher_t   aes;
-#endif
-#ifdef USE_SEED
-        seed_CBC        seed;
-#endif
-#ifdef USE_IDEA
-        idea_CBC        idea;
-#endif
-} psCipherContext_t;
-#define byte(x, n) (((x) >> (8 * (n))) & 255)
-#endif /* _h_PS_SYMMETRIC */
-/******************************************************************************/
diff --git a/networking/wget.c b/networking/wget.c
index a5369be22..85a04eaba 100644
--- a/networking/wget.c
+++ b/networking/wget.c
@@ -135,7 +135,8 @@
 //usage:#define wget_trivial_usage
 //usage:        IF_FEATURE_WGET_LONG_OPTIONS(
-//usage:       "[-cqS] [--spider] [-O FILE] [-o LOGFILE] [--header 'HEADER: VALUE'] [-Y on/off]\n"
+//usage:       "[-cqS] [--spider] [-O FILE] [-o LOGFILE] [--header STR]\n"
+//usage:       "        [--post-data STR | --post-file FILE] [-Y on/off]\n"
 /* Since we ignore these opts, we don't show them in --help */
 /* //usage:    "        [--no-cache] [--passive-ftp] [-t TRIES]" */
 /* //usage:    "        [-nv] [-nc] [-nH] [-np]" */
@@ -148,6 +149,9 @@
 //usage:       "Retrieve files via HTTP or FTP\n"
 //usage:        IF_FEATURE_WGET_LONG_OPTIONS(
 //usage:     "\n        --spider        Only check URL existence: $? is 0 if exists"
+//usage:     "\n        --header STR    Add STR (of form 'header: value') to headers"
+//usage:     "\n        --post-data STR Send STR using POST method"
+//usage:     "\n        --post-file FILE        Send FILE using POST method"
 //usage:        IF_FEATURE_WGET_OPENSSL(
 //usage:     "\n        --no-check-certificate  Don't validate the server's certificate"
 //usage:        )
@@ -244,6 +248,7 @@ struct globals {
        char *dir_prefix;
 #if ENABLE_FEATURE_WGET_LONG_OPTIONS
        char *post_data;
+        char *post_file;
        char *extra_headers;
        unsigned char user_headers; /* Headers mentioned by the user */
 #endif
@@ -292,10 +297,13 @@ enum {
        WGET_OPT_POST_DATA  = (1 << 12) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
        WGET_OPT_SPIDER     = (1 << 13) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
        WGET_OPT_NO_CHECK_CERT = (1 << 14) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
+        WGET_OPT_POST_FILE  = (1 << 15) * ENABLE_FEATURE_WGET_LONG_OPTIONS,
        /* hijack this bit for other than opts purposes: */
        WGET_NO_FTRUNCATE   = (1 << 31)
 };
+#define WGET_OPT_POST (WGET_OPT_POST_DATA | WGET_OPT_POST_FILE)
 enum {
        PROGRESS_START = -1,
        PROGRESS_END   = 0,
@@ -1246,7 +1254,7 @@ static void download_one_url(const char *url)
                                target.path);
                } else {
                        SENDFMT(sfp, "%s /%s HTTP/1.1\r\n",
-                                (option_mask32 & WGET_OPT_POST_DATA) ? "POST" : "GET",
+                                (option_mask32 & WGET_OPT_POST) ? "POST" : "GET",
                                target.path);
                }
                if (!USR_HEADER_HOST)
@@ -1279,7 +1287,13 @@ static void download_one_url(const char *url)
                        fputs(G.extra_headers, sfp);
                }
-                if (option_mask32 & WGET_OPT_POST_DATA) {
+                if (option_mask32 & WGET_OPT_POST_FILE) {
+                        int fd = xopen_stdin(G.post_file);
+                        G.post_data = xmalloc_read(fd, NULL);
+                        close(fd);
+                }
+                if (G.post_data) {
                        SENDFMT(sfp,
                                "Content-Type: application/x-www-form-urlencoded\r\n"
                                "Content-Length: %u\r\n"
@@ -1522,6 +1536,7 @@ IF_DESKTOP(	"tries\0"            Required_argument "t")
                "post-data\0"        Required_argument "\xfe"
                "spider\0"           No_argument       "\xfd"
                "no-check-certificate\0" No_argument   "\xfc"
+                "post-file\0"        Required_argument "\xfb"
                /* Ignored (we always use PASV): */
 IF_DESKTOP(     "passive-ftp\0"      No_argument       "\xf0")
                /* Ignored (we don't support caching) */
@@ -1565,6 +1580,9 @@ IF_DESKTOP(	"no-parent\0"        No_argument       "\xf0")
                 */
                "\0"
                "-1" /* at least one URL */
+                IF_FEATURE_WGET_LONG_OPTIONS(":\xfe--\xfb")
+                IF_FEATURE_WGET_LONG_OPTIONS(":\xfe--\xfe")
+                IF_FEATURE_WGET_LONG_OPTIONS(":\xfb--\xfb")
                IF_FEATURE_WGET_LONG_OPTIONS(":\xff::") /* --header is a list */
                LONGOPTS
                , &G.fname_out, &G.fname_log, &G.dir_prefix,
@@ -1574,6 +1592,7 @@ IF_DESKTOP(	"no-parent\0"        No_argument       "\xf0")
                NULL  /* -n[ARG] */
                IF_FEATURE_WGET_LONG_OPTIONS(, &headers_llist)
                IF_FEATURE_WGET_LONG_OPTIONS(, &G.post_data)
+                IF_FEATURE_WGET_LONG_OPTIONS(, &G.post_file)
        );
 #if 0 /* option bits debug */
        if (option_mask32 & WGET_OPT_RETRIES) bb_error_msg("-t NUM");
@@ -1582,6 +1601,7 @@ IF_DESKTOP(	"no-parent\0"        No_argument       "\xf0")
        if (option_mask32 & WGET_OPT_POST_DATA) bb_error_msg("--post-data");
        if (option_mask32 & WGET_OPT_SPIDER) bb_error_msg("--spider");
        if (option_mask32 & WGET_OPT_NO_CHECK_CERT) bb_error_msg("--no-check-certificate");
+        if (option_mask32 & WGET_OPT_POST_FILE) bb_error_msg("--post-file");
        exit(0);
 #endif
        argv += optind;
author	Ron Yorston <rmy@pobox.com>	2021-10-13 14:37:51 +0100
committer	Ron Yorston <rmy@pobox.com>	2021-10-13 14:37:51 +0100
commit	0ecf1aea459571b48dc68ddc2b7b9265740fa960 (patch)
tree	491d6184a44b8b525a4ca35759d622aecd7f6344 /networking
parent	4859ddcb20616718efbea12c6bf8b27c469b68de (diff)
parent	aaf3d5ba74c5da97ff80b61f30cb8dd225d39096 (diff)
download	busybox-w32-0ecf1aea459571b48dc68ddc2b7b9265740fa960.tar.gz busybox-w32-0ecf1aea459571b48dc68ddc2b7b9265740fa960.tar.bz2 busybox-w32-0ecf1aea459571b48dc68ddc2b7b9265740fa960.zip