Source: gmp
Version: 2:6.1.2+dfsg-1
Severity: wishlist
Tags: patch

Dear Maintainer,

Since --enable-fat works only on i386^1, NEON-optimized code is never used on armhf, which makes gmp much slower than it could be.

As a workaround, I suggest compiling and installing a separate NEON-optimized library in $(libdir)/neon/vfp and relying on ld.so for runtime detection.

A debdiff is attached; it has passed limited testing: [1] cross-compiled (which implies nocheck) with `pbuilder --host-arch=armhf --arch=i386`, installed on rpi3b+/raspbian-stretch, and benchmarked; [2] natively recompiled on rpi3b+/raspbian (passed the regression tests), then installed and benchmarked.

No idea how well this would work on hurd, kfreebsd, etc.

I hardcoded armv7 for the NEON code: Debian's armhf requires at least armv7, so this should be acceptable. I have not tested it, but this should not break the "fake armhf" from Raspbian (rpi1 is armv6 without NEON, so the new NEON-optimized variant would not be picked; newer rpi models are armv7+ and have NEON).

Potential pitfalls: this enables code that was previously untested (at least by Debian) on the affected platforms, so some bugs may lurk there.

As for the expected user-visible effect, here are the results of `gnutls-cli --benchmark-tls-kx` on a Raspberry Pi 3B+ (BCM2837B0, 4-core Cortex-A53 @1.4GHz):

Before:
Testing key exchanges (RSA/DH bits: 3072, EC bits: 256)
(TLS1.2)-(DHE-RSA-3072)-(AES-128-CBC)-(SHA1)  5.50 transactions/sec
           (avg. handshake time: 181.71 ms, sample variance: 0.43)
(TLS1.2)-(ECDHE-RSA-SECP256R1)-(AES-128-CBC)-(SHA1)  12.08 transactions/sec
           (avg. handshake time: 82.77 ms, sample variance: 0.18)
(TLS1.2)-(ECDHE-RSA-X25519)-(AES-128-CBC)-(SHA1)  12.32 transactions/sec
           (avg. handshake time: 81.03 ms, sample variance: 0.03)
(TLS1.2)-(ECDHE-ECDSA-SECP256R1)-(AES-128-CBC)-(SHA1) 77.88 transactions/sec
           (avg. handshake time: 12.73 ms, sample variance: 0.20)
(TLS1.2)-(ECDHE-ECDSA-X25519)-(AES-128-CBC)-(SHA1)  88.98 transactions/sec
           (avg. handshake time: 11.13 ms, sample variance: 0.11)
   (TLS1.2)-(RSA)-(AES-128-CBC)-(SHA1)  13.15 transactions/sec
           (avg. handshake time: 75.86 ms, sample variance: 0.12)

After:
Testing key exchanges (RSA/DH bits: 3072, EC bits: 256)
(TLS1.2)-(DHE-RSA-3072)-(AES-128-CBC)-(SHA1)  8.40 transactions/sec
           (avg. handshake time: 118.98 ms, sample variance: 0.27)
(TLS1.2)-(ECDHE-RSA-SECP256R1)-(AES-128-CBC)-(SHA1)  18.42 transactions/sec
           (avg. handshake time: 54.23 ms, sample variance: 0.18)
(TLS1.2)-(ECDHE-RSA-X25519)-(AES-128-CBC)-(SHA1)  18.88 transactions/sec
           (avg. handshake time: 52.82 ms, sample variance: 0.15)
(TLS1.2)-(ECDHE-ECDSA-SECP256R1)-(AES-128-CBC)-(SHA1) 93.89 transactions/sec
           (avg. handshake time: 10.54 ms, sample variance: 0.25)
(TLS1.2)-(ECDHE-ECDSA-X25519)-(AES-128-CBC)-(SHA1)  106.83 transactions/sec
           (avg. handshake time: 9.25 ms, sample variance: 0.19)
   (TLS1.2)-(RSA)-(AES-128-CBC)-(SHA1)  20.46 transactions/sec
           (avg. handshake time: 48.78 ms, sample variance: 0.18)

That is, a speedup of roughly 20% to 55% in transactions per second.

^1 --enable-fat works on amd64 too, but Debian disables it; maybe it is time to reconsider? Some related bugs have been fixed upstream since the last attempt (which resulted in #671866). That said, on my CPU amd64+fat is slower than the current Debian-packaged "fat-free" code, so I am not very interested in it.

-- System Information:
Debian Release: 9.6
  APT prefers stable-updates
APT policy: (500, 'stable-updates'), (500, 'stable-debug'), (500, 'proposed-updates-debug'), (500, 'proposed-updates'), (500, 'stable')
Architecture: i386 (x86_64)
Foreign Architectures: amd64

Kernel: Linux 4.9.0-6-amd64 (SMP w/2 CPU cores)
Locale: LANG=ru_RU.KOI8-R, LC_CTYPE=ru_RU.KOI8-R (charmap=KOI8-R), LANGUAGE=ru_RU.KOI8-R (charmap=KOI8-R)
Shell: /bin/sh linked to /bin/dash
Init: systemd (via /run/systemd/system)

diff -Nru gmp-6.1.2+dfsg/debian/rules gmp-6.1.2+dfsg/debian/rules
--- gmp-6.1.2+dfsg/debian/rules 2016-12-21 08:38:23.000000000 +0300
+++ gmp-6.1.2+dfsg/debian/rules 2019-01-02 22:52:33.000000000 +0300
@@ -68,9 +68,20 @@
 
 confflags_ma = $(confflags) $(confflags_build) --libdir=/usr/lib/$(DEB_HOST_MULTIARCH)
 
+FLAVORS = main
+LIBDIR_main =
+
 CC   = $(DEB_HOST_GNU_TYPE)-gcc
 CXX   = $(DEB_HOST_GNU_TYPE)-g++
 
+ifneq (,$(filter armhf, $(DEB_HOST_ARCH)))
+FLAVORS += neon
+
+LIBDIR_neon = neon/vfp
+neon_host_type = $(patsubst arm-%,armcortexa7neon-unknown-%,$(DEB_HOST_GNU_TYPE))
+confflags_neon = --host=$(neon_host_type) --target=$(neon_host_type) --libdir=/usr/lib/$(DEB_HOST_MULTIARCH)/$(LIBDIR_neon)
+CFLAGS_neon = -march=armv7-a -mfpu=neon
+endif
 
 get-orig-source: gmp-$(ORIG_SRC_VERSION).tar.xz
        tar --xz -xf $<
@@ -88,25 +99,34 @@
 gmp-$(ORIG_SRC_VERSION).tar.xz:
        wget https://gmplib.org/download/gmp/$@
 
-configure: configure-stamp
-configure-stamp:
-       mkdir -p build
-       cd build && ../configure $(confflags_ma) \
-           AR=$(AR) CC="$(CC)" CFLAGS="$(CFLAGS)" \
-           CXX="$(CXX)" CXXFLAGS="$(CXXFLAGS)"
+$(patsubst %,configure-%,$(FLAVORS)): configure-%: configure-stamp-%
+$(patsubst %,configure-stamp-%,$(FLAVORS)): configure-stamp-%:
+       mkdir -p build-$*
+       cd build-$* && ../configure $(confflags_ma) \
+           $(confflags_$*) \
+           AR=$(AR) CC="$(CC)" CFLAGS="$(CFLAGS) $(CFLAGS_$*)" \
+           CXX="$(CXX)" CXXFLAGS="$(CXXFLAGS) $(CFLAGS_$*)"
        touch $@
 
-build: build-stamp
-build-stamp: configure
+$(patsubst %,build-%,$(FLAVORS)): build-%: build-stamp-%
+$(patsubst %,build-stamp-%,$(FLAVORS)): build-stamp-%: configure-%
        dh_testdir
-       $(MAKE) $(JOBSFLAG) -C build
-       $(MAKE_CHECK) -C build
+       $(MAKE) $(JOBSFLAG) -C build-$*
+       $(MAKE_CHECK) -C build-$*
+       touch $@
+
+build: $(patsubst %,build-%,$(FLAVORS))
+build-stamp: $(patsubst %,build-stamp-%,$(FLAVORS))
        touch $@
 
 clean:
        dh_testdir
        dh_testroot
        rm -rf build build-stamp
+       rm -rf $(patsubst %,build-%,$(FLAVORS))
+       rm -rf $(patsubst %,build-stamp-%,$(FLAVORS))
+       rm -rf configure-stamp
+       rm -rf $(patsubst %,configure-stamp-%,$(FLAVORS))
        dh_clean
 
 install-prep:
@@ -115,13 +135,17 @@
        dh_prep
        dh_installdirs
 
+$(patsubst %,install-%,$(FLAVORS)): install-%: build-stamp-%
+       $(MAKE) DESTDIR=`pwd`/debian/tmp includeexecdir=/usr/include/$(DEB_HOST_MULTIARCH) -C build-$* install-exec
+       dh_install -plibgmp10 usr/lib/*/$(LIBDIR_$*)/libgmp.so.*
+
 install: build-stamp install-prep
        rm -rf debian/tmp
+       $(MAKE) -f debian/rules $(patsubst %,install-%,$(FLAVORS))
        # Install places gmp.h in 'includeexecdir' which is non-standard and cannot be set at compile time,
        # so override it at install.
-       $(MAKE) DESTDIR=`pwd`/debian/tmp includeexecdir=/usr/include/$(DEB_HOST_MULTIARCH) -C build install
+       $(MAKE) DESTDIR=`pwd`/debian/tmp includeexecdir=/usr/include/$(DEB_HOST_MULTIARCH) -C build-main install
 
-       dh_install -plibgmp10 usr/lib/*/libgmp.so.*
        dh_install -plibgmpxx4ldbl usr/lib/*/libgmpxx.so.*
 
        dh_install -plibgmp-dev usr/lib/*/lib*.so

Reply via email to