RPM Package Manager, CVS Repository http://rpm5.org/cvs/ ____________________________________________________________________________
Server: rpm5.org Name: Jeff Johnson Root: /v/rpm/cvs Email: j...@rpm5.org Module: rpm Date: 09-Oct-2014 04:41:04 Branch: rpm-5_4 Handle: 20141009023641240 Modified files: (Branch: rpm-5_4) rpm CHANGES rpm/rpmio Makefile.am blake2.h blake2b.c blake2bp.c blake2s.c blake2sp.c tgit.c Log: - blake2: merge SSE2 (and later) optimizations. - blake2: fix: x86_64 alignment issues for blake2s #pragma pack. Summary: Revision Changes Path 1.3501.2.453+2 -0 rpm/CHANGES 1.293.2.43 +2 -0 rpm/rpmio/Makefile.am 1.1.2.3 +5 -6 rpm/rpmio/blake2.h 1.1.2.3 +839 -43 rpm/rpmio/blake2b.c 1.1.2.5 +4 -4 rpm/rpmio/blake2bp.c 1.1.2.3 +735 -31 rpm/rpmio/blake2s.c 1.1.2.5 +4 -4 rpm/rpmio/blake2sp.c 1.1.2.45 +8 -7 rpm/rpmio/tgit.c ____________________________________________________________________________ patch -p0 <<'@@ .' Index: rpm/CHANGES ============================================================================ $ cvs diff -u -r1.3501.2.452 -r1.3501.2.453 CHANGES --- rpm/CHANGES 3 Oct 2014 19:20:10 -0000 1.3501.2.452 +++ rpm/CHANGES 9 Oct 2014 02:36:41 -0000 1.3501.2.453 @@ -1,4 +1,6 @@ 5.4.15 -> 5.4.16: + - jbj: blake2: merge SSE2 (and later) optimizations. + - jbj: blake2: fix: x86_64 alignment issues for blake2s #pragma pack. - jbj: rpmgfs: stub in macros and put/list/get/dump methods. - jbj: mongo: upgrade to mongo-c-driver 1.0.1 - jbj: mongo: resurrect unit tests. @@ . patch -p0 <<'@@ .' Index: rpm/rpmio/Makefile.am ============================================================================ $ cvs diff -u -r1.293.2.42 -r1.293.2.43 Makefile.am --- rpm/rpmio/Makefile.am 3 Oct 2014 19:20:14 -0000 1.293.2.42 +++ rpm/rpmio/Makefile.am 9 Oct 2014 02:37:38 -0000 1.293.2.43 @@ -7,6 +7,8 @@ SUBDIRS = auto # tests +CLEANFILES = *.gcov .libs/*.gcda .libs/*.gcno *.gcno *.gcda + EXTRA_DIST = librpmio.vers \ fnmatch_loop.c getdate.y html-parse.c html-parse.h libsqlio.c \ rpmcpio.c rpmcpio.h rpmgenbasedir.c rpmgenpkglist.c rpmgensrclist.c \ @@ . patch -p0 <<'@@ .' Index: rpm/rpmio/blake2.h ============================================================================ $ cvs diff -u -r1.1.2.2 -r1.1.2.3 blake2.h --- rpm/rpmio/blake2.h 31 Oct 2013 20:45:44 -0000 1.1.2.2 +++ rpm/rpmio/blake2.h 9 Oct 2014 02:38:24 -0000 1.1.2.3 @@ -20,7 +20,7 @@ #if defined(_MSC_VER) #define ALIGN(x) __declspec(align(x)) #else -#define ALIGN(x) __attribute__((aligned(x))) +#define ALIGN(x) __attribute__ ((__aligned__(x))) #endif #if defined(__cplusplus) @@ -43,7 +43,6 @@ BLAKE2B_PERSONALBYTES = 16 }; -#pragma pack(push, 1) typedef struct __blake2s_param { uint8_t digest_length; // 1 uint8_t key_length; // 2 @@ -53,7 +52,7 @@ uint8_t node_offset[6]; // 14 uint8_t node_depth; // 15 uint8_t inner_length; // 16 - // uint8_t reserved[0]; + uint8_t reserved[0]; uint8_t salt[BLAKE2S_SALTBYTES]; // 24 uint8_t personal[BLAKE2S_PERSONALBYTES]; // 32 } blake2s_param; @@ -65,6 +64,7 @@ uint8_t buf[2 * BLAKE2S_BLOCKBYTES]; size_t buflen; uint8_t last_node; + uint8_t pad[3]; } blake2s_state; typedef struct __blake2b_param { @@ -90,20 +90,19 @@ uint8_t last_node; } blake2b_state; - typedef struct __blake2sp_state { + ALIGN(64) typedef struct __blake2sp_state { blake2s_state S[8][1]; blake2s_state R[1]; uint8_t buf[8 * BLAKE2S_BLOCKBYTES]; size_t buflen; } blake2sp_state; - typedef struct __blake2bp_state { + ALIGN(64) typedef struct __blake2bp_state { blake2b_state S[4][1]; blake2b_state R[1]; uint8_t buf[4 * BLAKE2B_BLOCKBYTES]; size_t buflen; } blake2bp_state; -#pragma pack(pop) // Streaming API int blake2s_init(blake2s_state * S, const uint8_t outlen); @@ . patch -p0 <<'@@ .' Index: rpm/rpmio/blake2b.c ============================================================================ $ cvs diff -u -r1.1.2.2 -r1.1.2.3 blake2b.c --- rpm/rpmio/blake2b.c 31 Oct 2013 20:45:44 -0000 1.1.2.2 +++ rpm/rpmio/blake2b.c 9 Oct 2014 02:39:16 -0000 1.1.2.3 @@ -10,21 +10,668 @@ You should have received a copy of the CC0 Public Domain Dedication along with this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>. */ +#define XXXSSE #include "system.h" #include "blake2.h" #include "blake2-impl.h" +#ifdef XXXSSE /* compiler intrinsic defines may not be portable */ +#if defined(__SSE2__) +#define HAVE_SSE2 +#endif + +#if defined(__SSSE3__) +#define HAVE_SSSE3 +#endif + +#if defined(__SSE4_1__) +#define HAVE_SSE41 +#endif + +#if defined(__AVX__) +#define HAVE_AVX +#endif + +#if defined(__XOP__) +#define HAVE_XOP +#endif + + +#ifdef HAVE_AVX2 +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_XOP +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_AVX +#ifndef HAVE_SSE41 +#define HAVE_SSE41 +#endif +#endif + +#ifdef HAVE_SSE41 +#ifndef HAVE_SSSE3 +#define HAVE_SSSE3 +#endif +#endif + +#ifdef HAVE_SSSE3 +#define HAVE_SSE2 +#endif + +#if !defined(HAVE_SSE2) +#error "This code requires at least SSE2." +#endif + +#include <emmintrin.h> +#if defined(HAVE_SSSE3) +#include <tmmintrin.h> +#endif +#if defined(HAVE_SSE41) +#include <smmintrin.h> +#endif +#if defined(HAVE_AVX) +#include <immintrin.h> +#endif +#if defined(HAVE_XOP) +#include <x86intrin.h> +#endif +#endif /* XXXSSE */ + #include "debug.h" -static const uint64_t blake2b_IV[8] = { - 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL -}; +#ifdef XXXSSE +/*==============================================================*/ +/* --- blake2b-round.h */ + +#define LOAD(p) _mm_load_si128( (__m128i *)(p) ) +#define STORE(p,r) _mm_store_si128((__m128i *)(p), r) + +#define LOADU(p) _mm_loadu_si128( (__m128i *)(p) ) +#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r) + +#define TOF(reg) _mm_castsi128_ps((reg)) +#define TOI(reg) _mm_castps_si128((reg)) + +#define LIKELY(x) __builtin_expect((x),1) + + +/* Microarchitecture-specific macros */ +#ifndef HAVE_XOP +#ifdef HAVE_SSSE3 +#define _mm_roti_epi64(x, c) \ + (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1)) \ + : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \ + : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \ + : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x))) \ + : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) +#else +#define _mm_roti_epi64(r, c) _mm_xor_si128(_mm_srli_epi64( (r), -(c) ),_mm_slli_epi64( (r), 64-(-c) )) +#endif +#else +/* ... */ +#endif + + + +#define G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -32); \ + row4h = _mm_roti_epi64(row4h, -32); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -24); \ + row2h = _mm_roti_epi64(row2h, -24); \ + +#define G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1) \ + row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l); \ + row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h); \ + \ + row4l = _mm_xor_si128(row4l, row1l); \ + row4h = _mm_xor_si128(row4h, row1h); \ + \ + row4l = _mm_roti_epi64(row4l, -16); \ + row4h = _mm_roti_epi64(row4h, -16); \ + \ + row3l = _mm_add_epi64(row3l, row4l); \ + row3h = _mm_add_epi64(row3h, row4h); \ + \ + row2l = _mm_xor_si128(row2l, row3l); \ + row2h = _mm_xor_si128(row2h, row3h); \ + \ + row2l = _mm_roti_epi64(row2l, -63); \ + row2h = _mm_roti_epi64(row2h, -63); \ + +#if defined(HAVE_SSSE3) +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2h, row2l, 8); \ + t1 = _mm_alignr_epi8(row2l, row2h, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4h, row4l, 8); \ + t1 = _mm_alignr_epi8(row4l, row4h, 8); \ + row4l = t1; \ + row4h = t0; + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = _mm_alignr_epi8(row2l, row2h, 8); \ + t1 = _mm_alignr_epi8(row2h, row2l, 8); \ + row2l = t0; \ + row2h = t1; \ + \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + \ + t0 = _mm_alignr_epi8(row4l, row4h, 8); \ + t1 = _mm_alignr_epi8(row4h, row4l, 8); \ + row4l = t1; \ + row4h = t0; +#else + +#define DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = row4l;\ + t1 = row2l;\ + row4l = row3l;\ + row3l = row3h;\ + row3h = row4l;\ + row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0)); \ + row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h)); \ + row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h)); \ + row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1)) + +#define UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h) \ + t0 = row3l;\ + row3l = row3h;\ + row3h = t0;\ + t0 = row2l;\ + t1 = row4l;\ + row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l)); \ + row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h)); \ + row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h)); \ + row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1)) + +#endif +#if defined(HAVE_SSE41) +/*==============================================================*/ +/* --- blake2b-load-sse41.h */ + +#define LOAD_MSG_0_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_0_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_0_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_1_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_1_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_1_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} while(0) + + +#define LOAD_MSG_1_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) + + +#define LOAD_MSG_2_1(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m5, 8); \ +b1 = _mm_unpackhi_epi64(m2, m7); \ +} while(0) + + +#define LOAD_MSG_2_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m0); \ +b1 = _mm_blend_epi16(m1, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_2_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m5, m1, 0xF0); \ +b1 = _mm_unpackhi_epi64(m3, m4); \ +} while(0) + + +#define LOAD_MSG_2_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m3); \ +b1 = _mm_alignr_epi8(m2, m0, 8); \ +} while(0) + + +#define LOAD_MSG_3_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_unpackhi_epi64(m6, m5); \ +} while(0) + + +#define LOAD_MSG_3_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m0); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_3_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m2, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define LOAD_MSG_3_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m5); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_4_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m2); \ +b1 = _mm_unpacklo_epi64(m1, m5); \ +} while(0) + + +#define LOAD_MSG_4_2(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m0, m3, 0xF0); \ +b1 = _mm_blend_epi16(m2, m7, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m7, m5, 0xF0); \ +b1 = _mm_blend_epi16(m3, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_4_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m6, m0, 8); \ +b1 = _mm_blend_epi16(m4, m6, 0xF0); \ +} while(0) + + +#define LOAD_MSG_5_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m3); \ +b1 = _mm_unpacklo_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_5_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m5); \ +b1 = _mm_unpackhi_epi64(m5, m1); \ +} while(0) + + +#define LOAD_MSG_5_3(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m2, m3, 0xF0); \ +b1 = _mm_unpackhi_epi64(m7, m0); \ +} while(0) + + +#define LOAD_MSG_5_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m2); \ +b1 = _mm_blend_epi16(m7, m4, 0xF0); \ +} while(0) + + +#define LOAD_MSG_6_1(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m6, m0, 0xF0); \ +b1 = _mm_unpacklo_epi64(m7, m2); \ +} while(0) + + +#define LOAD_MSG_6_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_alignr_epi8(m5, m6, 8); \ +} while(0) + + +#define LOAD_MSG_6_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m3); \ +b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2)); \ +} while(0) + + +#define LOAD_MSG_6_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m3, m1); \ +b1 = _mm_blend_epi16(m1, m5, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m6, m3); \ +b1 = _mm_blend_epi16(m6, m1, 0xF0); \ +} while(0) + + +#define LOAD_MSG_7_2(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpackhi_epi64(m0, m4); \ +} while(0) + + +#define LOAD_MSG_7_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m2, m7); \ +b1 = _mm_unpacklo_epi64(m4, m1); \ +} while(0) + + +#define LOAD_MSG_7_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m2); \ +b1 = _mm_unpacklo_epi64(m3, m5); \ +} while(0) + + +#define LOAD_MSG_8_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m3, m7); \ +b1 = _mm_alignr_epi8(m0, m5, 8); \ +} while(0) + + +#define LOAD_MSG_8_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_alignr_epi8(m4, m1, 8); \ +} while(0) + + +#define LOAD_MSG_8_3(b0, b1) \ +do \ +{ \ +b0 = m6; \ +b1 = _mm_alignr_epi8(m5, m0, 8); \ +} while(0) + + +#define LOAD_MSG_8_4(b0, b1) \ +do \ +{ \ +b0 = _mm_blend_epi16(m1, m3, 0xF0); \ +b1 = m2; \ +} while(0) + + +#define LOAD_MSG_9_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_unpackhi_epi64(m3, m0); \ +} while(0) + + +#define LOAD_MSG_9_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m1, m2); \ +b1 = _mm_blend_epi16(m3, m2, 0xF0); \ +} while(0) + + +#define LOAD_MSG_9_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m7, m4); \ +b1 = _mm_unpackhi_epi64(m1, m6); \ +} while(0) + + +#define LOAD_MSG_9_4(b0, b1) \ +do \ +{ \ +b0 = _mm_alignr_epi8(m7, m5, 8); \ +b1 = _mm_unpacklo_epi64(m6, m0); \ +} while(0) + + +#define LOAD_MSG_10_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m0, m1); \ +b1 = _mm_unpacklo_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_10_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m0, m1); \ +b1 = _mm_unpackhi_epi64(m2, m3); \ +} while(0) + + +#define LOAD_MSG_10_3(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m4, m5); \ +b1 = _mm_unpacklo_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_10_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpackhi_epi64(m4, m5); \ +b1 = _mm_unpackhi_epi64(m6, m7); \ +} while(0) + + +#define LOAD_MSG_11_1(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m7, m2); \ +b1 = _mm_unpackhi_epi64(m4, m6); \ +} while(0) + + +#define LOAD_MSG_11_2(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m5, m4); \ +b1 = _mm_alignr_epi8(m3, m7, 8); \ +} while(0) + + +#define LOAD_MSG_11_3(b0, b1) \ +do \ +{ \ +b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2)); \ +b1 = _mm_unpackhi_epi64(m5, m2); \ +} while(0) + + +#define LOAD_MSG_11_4(b0, b1) \ +do \ +{ \ +b0 = _mm_unpacklo_epi64(m6, m1); \ +b1 = _mm_unpackhi_epi64(m3, m1); \ +} while(0) +/*==============================================================*/ +#else /* defined(HAVE_SSE41) */ +/*==============================================================*/ +/* --- blake2b-load-sse2.h */ + +#define LOAD_MSG_0_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_0_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_0_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_0_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) +#define LOAD_MSG_1_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_1_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_1_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_1_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) +#define LOAD_MSG_2_1(b0, b1) b0 = _mm_set_epi64x(m12, m11); b1 = _mm_set_epi64x(m15, m5) +#define LOAD_MSG_2_2(b0, b1) b0 = _mm_set_epi64x(m0, m8); b1 = _mm_set_epi64x(m13, m2) +#define LOAD_MSG_2_3(b0, b1) b0 = _mm_set_epi64x(m3, m10); b1 = _mm_set_epi64x(m9, m7) +#define LOAD_MSG_2_4(b0, b1) b0 = _mm_set_epi64x(m6, m14); b1 = _mm_set_epi64x(m4, m1) +#define LOAD_MSG_3_1(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m13) +#define LOAD_MSG_3_2(b0, b1) b0 = _mm_set_epi64x(m1, m9); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_3_3(b0, b1) b0 = _mm_set_epi64x(m5, m2); b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_3_4(b0, b1) b0 = _mm_set_epi64x(m10, m6); b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_4_1(b0, b1) b0 = _mm_set_epi64x(m5, m9); b1 = _mm_set_epi64x(m10, m2) +#define LOAD_MSG_4_2(b0, b1) b0 = _mm_set_epi64x(m7, m0); b1 = _mm_set_epi64x(m15, m4) +#define LOAD_MSG_4_3(b0, b1) b0 = _mm_set_epi64x(m11, m14); b1 = _mm_set_epi64x(m3, m6) +#define LOAD_MSG_4_4(b0, b1) b0 = _mm_set_epi64x(m12, m1); b1 = _mm_set_epi64x(m13, m8) +#define LOAD_MSG_5_1(b0, b1) b0 = _mm_set_epi64x(m6, m2); b1 = _mm_set_epi64x(m8, m0) +#define LOAD_MSG_5_2(b0, b1) b0 = _mm_set_epi64x(m10, m12); b1 = _mm_set_epi64x(m3, m11) +#define LOAD_MSG_5_3(b0, b1) b0 = _mm_set_epi64x(m7, m4); b1 = _mm_set_epi64x(m1, m15) +#define LOAD_MSG_5_4(b0, b1) b0 = _mm_set_epi64x(m5, m13); b1 = _mm_set_epi64x(m9, m14) +#define LOAD_MSG_6_1(b0, b1) b0 = _mm_set_epi64x(m1, m12); b1 = _mm_set_epi64x(m4, m14) +#define LOAD_MSG_6_2(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m10, m13) +#define LOAD_MSG_6_3(b0, b1) b0 = _mm_set_epi64x(m6, m0); b1 = _mm_set_epi64x(m8, m9) +#define LOAD_MSG_6_4(b0, b1) b0 = _mm_set_epi64x(m3, m7); b1 = _mm_set_epi64x(m11, m2) +#define LOAD_MSG_7_1(b0, b1) b0 = _mm_set_epi64x(m7, m13); b1 = _mm_set_epi64x(m3, m12) +#define LOAD_MSG_7_2(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m9, m1) +#define LOAD_MSG_7_3(b0, b1) b0 = _mm_set_epi64x(m15, m5); b1 = _mm_set_epi64x(m2, m8) +#define LOAD_MSG_7_4(b0, b1) b0 = _mm_set_epi64x(m4, m0); b1 = _mm_set_epi64x(m10, m6) +#define LOAD_MSG_8_1(b0, b1) b0 = _mm_set_epi64x(m14, m6); b1 = _mm_set_epi64x(m0, m11) +#define LOAD_MSG_8_2(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m8, m3) +#define LOAD_MSG_8_3(b0, b1) b0 = _mm_set_epi64x(m13, m12); b1 = _mm_set_epi64x(m10, m1) +#define LOAD_MSG_8_4(b0, b1) b0 = _mm_set_epi64x(m7, m2); b1 = _mm_set_epi64x(m5, m4) +#define LOAD_MSG_9_1(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m1, m7) +#define LOAD_MSG_9_2(b0, b1) b0 = _mm_set_epi64x(m4, m2); b1 = _mm_set_epi64x(m5, m6) +#define LOAD_MSG_9_3(b0, b1) b0 = _mm_set_epi64x(m9, m15); b1 = _mm_set_epi64x(m13, m3) +#define LOAD_MSG_9_4(b0, b1) b0 = _mm_set_epi64x(m14, m11); b1 = _mm_set_epi64x(m0, m12) +#define LOAD_MSG_10_1(b0, b1) b0 = _mm_set_epi64x(m2, m0); b1 = _mm_set_epi64x(m6, m4) +#define LOAD_MSG_10_2(b0, b1) b0 = _mm_set_epi64x(m3, m1); b1 = _mm_set_epi64x(m7, m5) +#define LOAD_MSG_10_3(b0, b1) b0 = _mm_set_epi64x(m10, m8); b1 = _mm_set_epi64x(m14, m12) +#define LOAD_MSG_10_4(b0, b1) b0 = _mm_set_epi64x(m11, m9); b1 = _mm_set_epi64x(m15, m13) +#define LOAD_MSG_11_1(b0, b1) b0 = _mm_set_epi64x(m4, m14); b1 = _mm_set_epi64x(m13, m9) +#define LOAD_MSG_11_2(b0, b1) b0 = _mm_set_epi64x(m8, m10); b1 = _mm_set_epi64x(m6, m15) +#define LOAD_MSG_11_3(b0, b1) b0 = _mm_set_epi64x(m0, m1); b1 = _mm_set_epi64x(m5, m11) +#define LOAD_MSG_11_4(b0, b1) b0 = _mm_set_epi64x(m2, m12); b1 = _mm_set_epi64x(m3, m7) +/*==============================================================*/ +#endif /* defined(HAVE_SSE41) */ + +#define ROUND(r) \ + LOAD_MSG_ ##r ##_1(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_2(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \ + LOAD_MSG_ ##r ##_3(b0, b1); \ + G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + LOAD_MSG_ ##r ##_4(b0, b1); \ + G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \ + UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); +/*==============================================================*/ +#else /* XXXSSE */ +/*==============================================================*/ static const uint8_t blake2b_sigma[12][16] = { {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, @@ -40,6 +687,39 @@ {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3} }; +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2b_sigma[r][2*i+0]]; \ + d = rotr64(d ^ a, 32); \ + c = c + d; \ + b = rotr64(b ^ c, 24); \ + a = a + b + m[blake2b_sigma[r][2*i+1]]; \ + d = rotr64(d ^ a, 16); \ + c = c + d; \ + b = rotr64(b ^ c, 63); \ + } while(0) +#define ROUND(r) \ + do { \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ + } while(0) +/*==============================================================*/ +#endif /* XXXSSE */ + +ALIGN(64) +static const uint64_t blake2b_IV[8] = { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, + 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, + 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + static inline int blake2b_set_lastnode(blake2b_state * S) { @@ -47,13 +727,13 @@ return 0; } +/* Some helper functions, not necessarily useful */ static inline int blake2b_clear_lastnode(blake2b_state * S) { S->f[1] = 0ULL; return 0; } -/* Some helper functions, not necessarily useful */ static inline int blake2b_set_lastblock(blake2b_state * S) { if (S->last_node) @@ -75,8 +755,21 @@ static inline int blake2b_increment_counter(blake2b_state * S, const uint64_t inc) { +#ifdef XXXSSE +#if __x86_64__ + // ADD/ADC chain + __uint128_t t = ((__uint128_t) S->t[1] << 64) | S->t[0]; + t += inc; + S->t[0] = (uint64_t) (t >> 0); + S->t[1] = (uint64_t) (t >> 64); +#else + S->t[0] += inc; + S->t[1] += (S->t[0] < inc); +#endif +#else /* XXXSSE */ S->t[0] += inc; S->t[1] += (S->t[0] < inc); +#endif /* XXXSSE */ return 0; } @@ -165,6 +858,18 @@ /* init xors IV with input parameter block */ int blake2b_init_param(blake2b_state * S, const blake2b_param * P) { +#ifdef XXXSSE + uint8_t *p, *h, *v; + //blake2b_init0( S ); + v = (uint8_t *) (blake2b_IV); + h = (uint8_t *) (S->h); + p = (uint8_t *) (P); + /* IV XOR ParamBlock */ + memset(S, 0, sizeof(blake2b_state)); + + for (int i = 0; i < BLAKE2B_OUTBYTES; ++i) + h[i] = v[i] ^ p[i]; +#else /* XXXSSE */ blake2b_init0(S); uint8_t *p = (uint8_t *) (P); size_t i; @@ -172,6 +877,7 @@ /* IV XOR ParamBlock */ for (i = 0; i < 8; ++i) S->h[i] ^= load64(p + sizeof(S->h[i]) * i); +#endif /* XXXSSE */ return 0; } @@ -185,6 +891,23 @@ if (outlen == 0 || outlen > BLAKE2B_OUTBYTES) return -1; +#ifdef XXXSSE + { const blake2b_param _P = { + .digest_length = outlen, + .key_length = 0, + .fanout = 1, + .depth = 1, + .leaf_length = 0, + .node_offset = 0, + .node_depth = 0, + .inner_length = 0, + .reserved = {0}, + .salt = {0}, + .personal = {0} + }; + P[0] = *(blake2b_param *)&_P; /* structure assignment */ + } +#else /* XXXSSE */ P->digest_length = outlen; P->key_length = 0; P->fanout = 1; @@ -193,9 +916,11 @@ store64(&P->node_offset, 0); P->node_depth = 0; P->inner_length = 0; - memset(P->reserved, 0, sizeof(P->reserved)); + memset(P->reserved, 0, sizeof(P->reserved) ); memset(P->salt, 0, sizeof(P->salt)); memset(P->personal, 0, sizeof(P->personal)); +#endif /* XXXSSE */ + return blake2b_init_param(S, P); } @@ -211,6 +936,23 @@ if (!key || !keylen || keylen > BLAKE2B_KEYBYTES) return -1; +#ifdef XXXSSE + { const blake2b_param _P = { + .digest_length = outlen, + .key_length = keylen, + .fanout = 1, + .depth = 1, + .leaf_length = 0, + .node_offset = 0, + .node_depth = 0, + .inner_length = 0, + .reserved = {0}, + .salt = {0}, + .personal = {0} + }; + P[0] = *(blake2b_param *)&_P; /* structure assignment */ + } +#else /* XXXSSE */ P->digest_length = outlen; P->key_length = keylen; P->fanout = 1; @@ -219,9 +961,10 @@ store64(&P->node_offset, 0); P->node_depth = 0; P->inner_length = 0; - memset(P->reserved, 0, sizeof(P->reserved)); + memset(P->reserved, 0, sizeof(P->reserved) ); memset(P->salt, 0, sizeof(P->salt)); memset(P->personal, 0, sizeof(P->personal)); +#endif /* XXXSSE */ if (blake2b_init_param(S, P) < 0) return -1; @@ -236,9 +979,60 @@ return 0; } -static int blake2b_compress(blake2b_state * S, +static inline int blake2b_compress(blake2b_state * S, const uint8_t block[BLAKE2B_BLOCKBYTES]) { +#ifdef XXXSSE + __m128i row1l, row1h; + __m128i row2l, row2h; + __m128i row3l, row3h; + __m128i row4l, row4h; + __m128i b0, b1; + __m128i t0, t1; +#if defined(HAVE_SSSE3) && !defined(HAVE_XOP) + const __m128i r16 = + _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, + 9); + const __m128i r24 = + _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, + 10); +#endif +#if defined(HAVE_SSE41) + const __m128i m0 = LOADU(block + 00); + const __m128i m1 = LOADU(block + 16); + const __m128i m2 = LOADU(block + 32); + const __m128i m3 = LOADU(block + 48); + const __m128i m4 = LOADU(block + 64); + const __m128i m5 = LOADU(block + 80); + const __m128i m6 = LOADU(block + 96); + const __m128i m7 = LOADU(block + 112); +#else + const uint64_t m0 = ((uint64_t *) block)[0]; + const uint64_t m1 = ((uint64_t *) block)[1]; + const uint64_t m2 = ((uint64_t *) block)[2]; + const uint64_t m3 = ((uint64_t *) block)[3]; + const uint64_t m4 = ((uint64_t *) block)[4]; + const uint64_t m5 = ((uint64_t *) block)[5]; + const uint64_t m6 = ((uint64_t *) block)[6]; + const uint64_t m7 = ((uint64_t *) block)[7]; + const uint64_t m8 = ((uint64_t *) block)[8]; + const uint64_t m9 = ((uint64_t *) block)[9]; + const uint64_t m10 = ((uint64_t *) block)[10]; + const uint64_t m11 = ((uint64_t *) block)[11]; + const uint64_t m12 = ((uint64_t *) block)[12]; + const uint64_t m13 = ((uint64_t *) block)[13]; + const uint64_t m14 = ((uint64_t *) block)[14]; + const uint64_t m15 = ((uint64_t *) block)[15]; +#endif + row1l = LOADU(&S->h[0]); + row1h = LOADU(&S->h[2]); + row2l = LOADU(&S->h[4]); + row2h = LOADU(&S->h[6]); + row3l = LOADU(&blake2b_IV[0]); + row3h = LOADU(&blake2b_IV[2]); + row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0])); + row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0])); +#else /* XXXSSE */ uint64_t m[16]; uint64_t v[16]; int i; @@ -257,28 +1051,8 @@ v[13] = S->t[1] ^ blake2b_IV[5]; v[14] = S->f[0] ^ blake2b_IV[6]; v[15] = S->f[1] ^ blake2b_IV[7]; -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b + m[blake2b_sigma[r][2*i+0]]; \ - d = rotr64(d ^ a, 32); \ - c = c + d; \ - b = rotr64(b ^ c, 24); \ - a = a + b + m[blake2b_sigma[r][2*i+1]]; \ - d = rotr64(d ^ a, 16); \ - c = c + d; \ - b = rotr64(b ^ c, 63); \ - } while(0) -#define ROUND(r) \ - do { \ - G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ - G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ - G(r,2,v[ 2],v[ 6],v[10],v[14]); \ - G(r,3,v[ 3],v[ 7],v[11],v[15]); \ - G(r,4,v[ 0],v[ 5],v[10],v[15]); \ - G(r,5,v[ 1],v[ 6],v[11],v[12]); \ - G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ - G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ - } while(0) +#endif /* XXXSSE */ + ROUND(0); ROUND(1); ROUND(2); @@ -292,15 +1066,23 @@ ROUND(10); ROUND(11); +#ifdef XXXSSE + row1l = _mm_xor_si128(row3l, row1l); + row1h = _mm_xor_si128(row3h, row1h); + STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l)); + STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h)); + row2l = _mm_xor_si128(row4l, row2l); + row2h = _mm_xor_si128(row4h, row2h); + STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l)); + STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h)); +#else /* XXXSSE */ for (i = 0; i < 8; ++i) S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; +#endif /* XXXSSE */ -#undef G -#undef ROUND return 0; } -/* inlen now in bytes */ int blake2b_update(blake2b_state * S, const uint8_t * in, uint64_t inlen) { while (inlen > 0) { @@ -328,11 +1110,8 @@ return 0; } -/* Is this correct? */ int blake2b_final(blake2b_state * S, uint8_t * out, uint8_t outlen) { - uint8_t buffer[BLAKE2B_OUTBYTES]; - int i; if (S->buflen > BLAKE2B_BLOCKBYTES) { blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES); @@ -346,10 +1125,17 @@ memset(S->buf + S->buflen, 0, 2 * BLAKE2B_BLOCKBYTES - S->buflen); /* Padding */ blake2b_compress(S, S->buf); - for (i = 0; i < 8; ++i) /* Output full hash to temp buffer */ - store64(buffer + sizeof(S->h[i]) * i, S->h[i]); +#ifdef XXXSSE + memcpy( out, &S->h[0], outlen ); +#else /* XXXSSE */ + { uint8_t buffer[BLAKE2B_OUTBYTES]; + int i; + for (i = 0; i < 8; ++i) /* Output full hash to temp buffer */ + store64(buffer + sizeof(S->h[i]) * i, S->h[i]); - memcpy(out, buffer, outlen); + memcpy(out, buffer, outlen); + } +#endif /* XXXSSE */ return 0; } @@ -382,6 +1168,16 @@ return 0; } + +#if defined(SUPERCOP) +int crypto_hash(unsigned char *out, unsigned char *in, + unsigned long long inlen) +{ + return blake2b(out, in, NULL, BLAKE2B_OUTBYTES, inlen, 0); +} +#endif + + #if defined(BLAKE2B_SELFTEST) #include "blake2-kat.h" int main(int argc, char **argv) @@ -398,9 +1194,9 @@ for (i = 0; i < KAT_LENGTH; ++i) { uint8_t hash[BLAKE2B_OUTBYTES]; - blake2b(hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES); - if (memcmp(hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES)) { + if (blake2b(hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES) < 0 + || memcmp(hash, blake2b_keyed_kat[i], BLAKE2B_OUTBYTES)) { puts("error"); return -1; } @@ . patch -p0 <<'@@ .' Index: rpm/rpmio/blake2bp.c ============================================================================ $ cvs diff -u -r1.1.2.4 -r1.1.2.5 blake2bp.c --- rpm/rpmio/blake2bp.c 27 Mar 2014 19:14:04 -0000 1.1.2.4 +++ rpm/rpmio/blake2bp.c 9 Oct 2014 02:39:34 -0000 1.1.2.5 @@ -35,7 +35,7 @@ store32(&P->leaf_length, 0); store64(&P->node_offset, offset); P->node_depth = 0; - P->inner_length = outlen; + P->inner_length = BLAKE2B_OUTBYTES; memset(P->reserved, 0, sizeof(P->reserved)); memset(P->salt, 0, sizeof(P->salt)); memset(P->personal, 0, sizeof(P->personal)); @@ -53,7 +53,7 @@ store32(&P->leaf_length, 0); store64(&P->node_offset, 0); P->node_depth = 1; - P->inner_length = outlen; + P->inner_length = BLAKE2B_OUTBYTES; memset(P->reserved, 0, sizeof(P->reserved)); memset(P->salt, 0, sizeof(P->salt)); memset(P->personal, 0, sizeof(P->personal)); @@ -289,9 +289,9 @@ for (i = 0; i < KAT_LENGTH; ++i) { uint8_t hash[BLAKE2B_OUTBYTES]; - blake2bp(hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES); - if (memcmp(hash, blake2bp_keyed_kat[i], BLAKE2B_OUTBYTES)) { + if (blake2bp(hash, buf, key, BLAKE2B_OUTBYTES, i, BLAKE2B_KEYBYTES) < 0 + || memcmp(hash, blake2bp_keyed_kat[i], BLAKE2B_OUTBYTES)) { puts("error"); return -1; } @@ . patch -p0 <<'@@ .' Index: rpm/rpmio/blake2s.c ============================================================================ $ cvs diff -u -r1.1.2.2 -r1.1.2.3 blake2s.c --- rpm/rpmio/blake2s.c 31 Oct 2013 20:45:44 -0000 1.1.2.2 +++ rpm/rpmio/blake2s.c 9 Oct 2014 02:39:54 -0000 1.1.2.3 @@ -10,19 +10,596 @@ You should have received a copy of the CC0 Public Domain Dedication along with this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>. */ +#define XXXSSE #include "system.h" #include "blake2.h" #include "blake2-impl.h" +#ifdef XXXSSE /* compiler intrinsic defines may not be portable */ +#if defined(__SSE2__) +#define HAVE_SSE2 +#endif + +#if defined(__SSSE3__) +#define HAVE_SSSE3 +#endif + +#if defined(__SSE4_1__) +#define HAVE_SSE41 +#endif + +#if defined(__AVX__) +#define HAVE_AVX +#endif + +#if defined(__XOP__) +#define HAVE_XOP +#endif + + +#ifdef HAVE_AVX2 +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_XOP +#ifndef HAVE_AVX +#define HAVE_AVX +#endif +#endif + +#ifdef HAVE_AVX +#ifndef HAVE_SSE41 +#define HAVE_SSE41 +#endif +#endif + +#ifdef HAVE_SSE41 +#ifndef HAVE_SSSE3 +#define HAVE_SSSE3 +#endif +#endif + +#ifdef HAVE_SSSE3 +#define HAVE_SSE2 +#endif + +#if !defined(HAVE_SSE2) +#error "This code requires at least SSE2." +#endif +#include <emmintrin.h> +#if defined(HAVE_SSSE3) +#include <tmmintrin.h> +#endif +#if defined(HAVE_SSE41) +#include <smmintrin.h> +#endif +#if defined(HAVE_AVX) +#include <immintrin.h> +#endif +#if defined(HAVE_XOP) +#include <x86intrin.h> +#endif /* XXXSSE */ + +#endif + #include "debug.h" -static const uint32_t blake2s_IV[8] = { - 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, - 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL -}; +#ifdef XXXSSE +/*==============================================================*/ +/* --- blake2s-round.h */ + +#define LOAD(p) _mm_load_si128( (__m128i *)(p) ) +#define STORE(p,r) _mm_store_si128((__m128i *)(p), r) + +#define LOADU(p) _mm_loadu_si128( (__m128i *)(p) ) +#define STOREU(p,r) _mm_storeu_si128((__m128i *)(p), r) + +#define TOF(reg) _mm_castsi128_ps((reg)) +#define TOI(reg) _mm_castps_si128((reg)) + +#define LIKELY(x) __builtin_expect((x),1) + + +/* Microarchitecture-specific macros */ +#ifndef HAVE_XOP +#ifdef HAVE_SSSE3 +#define _mm_roti_epi32(r, c) ( \ + (8==-(c)) ? _mm_shuffle_epi8(r,r8) \ + : (16==-(c)) ? _mm_shuffle_epi8(r,r16) \ + : _mm_xor_si128(_mm_srli_epi32( (r), -(c) ),_mm_slli_epi32( (r), 32-(-(c)) )) ) +#else +#define _mm_roti_epi32(r, c) _mm_xor_si128(_mm_srli_epi32( (r), -(c) ),_mm_slli_epi32( (r), 32-(-c) )) +#endif +#else +/* ... */ +#endif + +#define G1(row1,row2,row3,row4,buf) \ + row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \ + row4 = _mm_xor_si128( row4, row1 ); \ + row4 = _mm_roti_epi32(row4, -16); \ + row3 = _mm_add_epi32( row3, row4 ); \ + row2 = _mm_xor_si128( row2, row3 ); \ + row2 = _mm_roti_epi32(row2, -12); + +#define G2(row1,row2,row3,row4,buf) \ + row1 = _mm_add_epi32( _mm_add_epi32( row1, buf), row2 ); \ + row4 = _mm_xor_si128( row4, row1 ); \ + row4 = _mm_roti_epi32(row4, -8); \ + row3 = _mm_add_epi32( row3, row4 ); \ + row2 = _mm_xor_si128( row2, row3 ); \ + row2 = _mm_roti_epi32(row2, -7); + +#define DIAGONALIZE(row1,row2,row3,row4) \ + row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(2,1,0,3) ); \ + row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \ + row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(0,3,2,1) ); + +#define UNDIAGONALIZE(row1,row2,row3,row4) \ + row4 = _mm_shuffle_epi32( row4, _MM_SHUFFLE(0,3,2,1) ); \ + row3 = _mm_shuffle_epi32( row3, _MM_SHUFFLE(1,0,3,2) ); \ + row2 = _mm_shuffle_epi32( row2, _MM_SHUFFLE(2,1,0,3) ); + +#if defined(HAVE_XOP) +/*==============================================================*/ +/* --- blake2s-load-xop.h */ + +#define TOB(x) ((x)*4*0x01010101 + 0x03020100) // ..or not TOB + +/* Basic VPPERM emulation, for testing purposes */ +/*static __m128i _mm_perm_epi8(const __m128i src1, const __m128i src2, const __m128i sel) +{ + const __m128i sixteen = _mm_set1_epi8(16); + const __m128i t0 = _mm_shuffle_epi8(src1, sel); + const __m128i s1 = _mm_shuffle_epi8(src2, _mm_sub_epi8(sel, sixteen)); + const __m128i mask = _mm_or_si128(_mm_cmpeq_epi8(sel, sixteen), + _mm_cmpgt_epi8(sel, sixteen)); // (>=16) = 0xff : 00 + return _mm_blendv_epi8(t0, s1, mask); +}*/ + +#define LOAD_MSG_0_1(buf) \ +buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) ); + +#define LOAD_MSG_0_2(buf) \ +buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) ); + +#define LOAD_MSG_0_3(buf) \ +buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(2),TOB(0)) ); + +#define LOAD_MSG_0_4(buf) \ +buf = _mm_perm_epi8(m2, m3, _mm_set_epi32(TOB(7),TOB(5),TOB(3),TOB(1)) ); + +#define LOAD_MSG_1_1(buf) \ +t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(5),TOB(0),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(6)) ); + +#define LOAD_MSG_1_2(buf) \ +t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(2),TOB(0),TOB(4),TOB(6)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); + +#define LOAD_MSG_1_3(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(0),TOB(0),TOB(1)) ); \ +buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); + +#define LOAD_MSG_1_4(buf) \ +t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(7),TOB(2),TOB(0)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) ); + +#define LOAD_MSG_2_1(buf) \ +t0 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(0),TOB(1),TOB(0),TOB(7)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(4),TOB(0)) ); + +#define LOAD_MSG_2_2(buf) \ +t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(2),TOB(0),TOB(4)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(0)) ); + +#define LOAD_MSG_2_3(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(7),TOB(3),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(6)) ); + +#define LOAD_MSG_2_4(buf) \ +t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(1),TOB(6),TOB(0)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) ); + +#define LOAD_MSG_3_1(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(3),TOB(7)) ); \ +t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(5),TOB(1),TOB(0)) ); + +#define LOAD_MSG_3_2(buf) \ +t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(1),TOB(5)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(6),TOB(4),TOB(1),TOB(0)) ); + +#define LOAD_MSG_3_3(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(4),TOB(5),TOB(2)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); + +#define LOAD_MSG_3_4(buf) \ +t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \ +buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(4),TOB(2),TOB(6),TOB(0)) ); + +#define LOAD_MSG_4_1(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(5),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(6),TOB(2),TOB(1),TOB(5)) ); + +#define LOAD_MSG_4_2(buf) \ +t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(4),TOB(7),TOB(0)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); + +#define LOAD_MSG_4_3(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(6),TOB(0),TOB(0)) ); \ +t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(6)) ); + +#define LOAD_MSG_4_4(buf) \ +t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(4),TOB(0),TOB(1)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(4),TOB(0)) ); + +#define LOAD_MSG_5_1(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(2)) ); \ +buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(4),TOB(2),TOB(1),TOB(0)) ); + +#define LOAD_MSG_5_2(buf) \ +t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(6),TOB(0)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(4)) ); + +#define LOAD_MSG_5_3(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(0),TOB(7),TOB(4)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); + +#define LOAD_MSG_5_4(buf) \ +t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(5),TOB(0),TOB(1),TOB(0)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(6),TOB(1),TOB(5)) ); + +#define LOAD_MSG_6_1(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(4),TOB(0),TOB(1),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(6),TOB(1),TOB(4)) ); + +#define LOAD_MSG_6_2(buf) \ +t1 = _mm_perm_epi8(m1, m2, _mm_set_epi32(TOB(6),TOB(0),TOB(0),TOB(1)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(5),TOB(7),TOB(0)) ); + +#define LOAD_MSG_6_3(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(6),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(4),TOB(5),TOB(1),TOB(0)) ); + +#define LOAD_MSG_6_4(buf) \ +t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(2),TOB(3),TOB(7)) ); \ +buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(7),TOB(2),TOB(1),TOB(0)) ); + +#define LOAD_MSG_7_1(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(3),TOB(0),TOB(7),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(5)) ); + +#define LOAD_MSG_7_2(buf) \ +t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(5),TOB(1),TOB(0),TOB(7)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) ); + +#define LOAD_MSG_7_3(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(2),TOB(0),TOB(0),TOB(5)) ); \ +t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(4),TOB(1),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(7),TOB(0)) ); + +#define LOAD_MSG_7_4(buf) \ +t1 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(6),TOB(4),TOB(0)) ); \ +buf = _mm_perm_epi8(t1, m2, _mm_set_epi32(TOB(6),TOB(2),TOB(1),TOB(0)) ); + +#define LOAD_MSG_8_1(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(6)) ); \ +t0 = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(7),TOB(1),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(6),TOB(0)) ); + +#define LOAD_MSG_8_2(buf) \ +t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(4),TOB(3),TOB(5),TOB(0)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(1),TOB(7)) ); + +#define LOAD_MSG_8_3(buf) \ +t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(6),TOB(1),TOB(0),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(3),TOB(2),TOB(5),TOB(4)) ); \ + +#define LOAD_MSG_8_4(buf) \ +buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(4),TOB(7),TOB(2)) ); + +#define LOAD_MSG_9_1(buf) \ +t0 = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(1),TOB(7),TOB(0),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m2, _mm_set_epi32(TOB(3),TOB(2),TOB(4),TOB(6)) ); + +#define LOAD_MSG_9_2(buf) \ +buf = _mm_perm_epi8(m0, m1, _mm_set_epi32(TOB(5),TOB(6),TOB(4),TOB(2)) ); + +#define LOAD_MSG_9_3(buf) \ +t0 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(3),TOB(5),TOB(0)) ); \ +buf = _mm_perm_epi8(t0, m3, _mm_set_epi32(TOB(5),TOB(2),TOB(1),TOB(7)) ); + +#define LOAD_MSG_9_4(buf) \ +t1 = _mm_perm_epi8(m0, m2, _mm_set_epi32(TOB(0),TOB(0),TOB(0),TOB(7)) ); \ +buf = _mm_perm_epi8(t1, m3, _mm_set_epi32(TOB(3),TOB(4),TOB(6),TOB(0)) ); + +/*==============================================================*/ +#elif defined(HAVE_SSE41) +/*==============================================================*/ +/* --- blake2s-load-sse41.h */ + +#define LOAD_MSG_0_1(buf) \ +buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(2,0,2,0))); + +#define LOAD_MSG_0_2(buf) \ +buf = TOI(_mm_shuffle_ps(TOF(m0), TOF(m1), _MM_SHUFFLE(3,1,3,1))); + +#define LOAD_MSG_0_3(buf) \ +buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(2,0,2,0))); + +#define LOAD_MSG_0_4(buf) \ +buf = TOI(_mm_shuffle_ps(TOF(m2), TOF(m3), _MM_SHUFFLE(3,1,3,1))); + +#define LOAD_MSG_1_1(buf) \ +t0 = _mm_blend_epi16(m1, m2, 0x0C); \ +t1 = _mm_slli_si128(m3, 4); \ +t2 = _mm_blend_epi16(t0, t1, 0xF0); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3)); + +#define LOAD_MSG_1_2(buf) \ +t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0)); \ +t1 = _mm_blend_epi16(m1,m3,0xC0); \ +t2 = _mm_blend_epi16(t0, t1, 0xF0); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); + +#define LOAD_MSG_1_3(buf) \ +t0 = _mm_slli_si128(m1, 4); \ +t1 = _mm_blend_epi16(m2, t0, 0x30); \ +t2 = _mm_blend_epi16(m0, t1, 0xF0); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); + +#define LOAD_MSG_1_4(buf) \ +t0 = _mm_unpackhi_epi32(m0,m1); \ +t1 = _mm_slli_si128(m3, 4); \ +t2 = _mm_blend_epi16(t0, t1, 0x0C); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1)); + +#define LOAD_MSG_2_1(buf) \ +t0 = _mm_unpackhi_epi32(m2,m3); \ +t1 = _mm_blend_epi16(m3,m1,0x0C); \ +t2 = _mm_blend_epi16(t0, t1, 0x0F); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); + +#define LOAD_MSG_2_2(buf) \ +t0 = _mm_unpacklo_epi32(m2,m0); \ +t1 = _mm_blend_epi16(t0, m0, 0xF0); \ +t2 = _mm_slli_si128(m3, 8); \ +buf = _mm_blend_epi16(t1, t2, 0xC0); + +#define LOAD_MSG_2_3(buf) \ +t0 = _mm_blend_epi16(m0, m2, 0x3C); \ +t1 = _mm_srli_si128(m1, 12); \ +t2 = _mm_blend_epi16(t0,t1,0x03); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2)); + +#define LOAD_MSG_2_4(buf) \ +t0 = _mm_slli_si128(m3, 4); \ +t1 = _mm_blend_epi16(m0, m1, 0x33); \ +t2 = _mm_blend_epi16(t1, t0, 0xC0); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3)); + +#define LOAD_MSG_3_1(buf) \ +t0 = _mm_unpackhi_epi32(m0,m1); \ +t1 = _mm_unpackhi_epi32(t0, m2); \ +t2 = _mm_blend_epi16(t1, m3, 0x0C); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2)); + +#define LOAD_MSG_3_2(buf) \ +t0 = _mm_slli_si128(m2, 8); \ +t1 = _mm_blend_epi16(m3,m0,0x0C); \ +t2 = _mm_blend_epi16(t1, t0, 0xC0); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); + +#define LOAD_MSG_3_3(buf) \ +t0 = _mm_blend_epi16(m0,m1,0x0F); \ +t1 = _mm_blend_epi16(t0, m3, 0xC0); \ +buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); + +#define LOAD_MSG_3_4(buf) \ +t0 = _mm_unpacklo_epi32(m0,m2); \ +t1 = _mm_unpackhi_epi32(m1,m2); \ +buf = _mm_unpacklo_epi64(t1,t0); + +#define LOAD_MSG_4_1(buf) \ +t0 = _mm_unpacklo_epi64(m1,m2); \ +t1 = _mm_unpackhi_epi64(m0,m2); \ +t2 = _mm_blend_epi16(t0,t1,0x33); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3)); + +#define LOAD_MSG_4_2(buf) \ +t0 = _mm_unpackhi_epi64(m1,m3); \ +t1 = _mm_unpacklo_epi64(m0,m1); \ +buf = _mm_blend_epi16(t0,t1,0x33); + +#define LOAD_MSG_4_3(buf) \ +t0 = _mm_unpackhi_epi64(m3,m1); \ +t1 = _mm_unpackhi_epi64(m2,m0); \ +buf = _mm_blend_epi16(t1,t0,0x33); + +#define LOAD_MSG_4_4(buf) \ +t0 = _mm_blend_epi16(m0,m2,0x03); \ +t1 = _mm_slli_si128(t0, 8); \ +t2 = _mm_blend_epi16(t1,m3,0x0F); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3)); + +#define LOAD_MSG_5_1(buf) \ +t0 = _mm_unpackhi_epi32(m0,m1); \ +t1 = _mm_unpacklo_epi32(m0,m2); \ +buf = _mm_unpacklo_epi64(t0,t1); + +#define LOAD_MSG_5_2(buf) \ +t0 = _mm_srli_si128(m2, 4); \ +t1 = _mm_blend_epi16(m0,m3,0x03); \ +buf = _mm_blend_epi16(t1,t0,0x3C); + +#define LOAD_MSG_5_3(buf) \ +t0 = _mm_blend_epi16(m1,m0,0x0C); \ +t1 = _mm_srli_si128(m3, 4); \ +t2 = _mm_blend_epi16(t0,t1,0x30); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0)); + +#define LOAD_MSG_5_4(buf) \ +t0 = _mm_unpacklo_epi64(m1,m2); \ +t1= _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1)); \ +buf = _mm_blend_epi16(t0,t1,0x33); + +#define LOAD_MSG_6_1(buf) \ +t0 = _mm_slli_si128(m1, 12); \ +t1 = _mm_blend_epi16(m0,m3,0x33); \ +buf = _mm_blend_epi16(t1,t0,0xC0); + +#define LOAD_MSG_6_2(buf) \ +t0 = _mm_blend_epi16(m3,m2,0x30); \ +t1 = _mm_srli_si128(m1, 4); \ +t2 = _mm_blend_epi16(t0,t1,0x03); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0)); + +#define LOAD_MSG_6_3(buf) \ +t0 = _mm_unpacklo_epi64(m0,m2); \ +t1 = _mm_srli_si128(m1, 4); \ +buf = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0)); + +#define LOAD_MSG_6_4(buf) \ +t0 = _mm_unpackhi_epi32(m1,m2); \ +t1 = _mm_unpackhi_epi64(m0,t0); \ +buf = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2)); + +#define LOAD_MSG_7_1(buf) \ +t0 = _mm_unpackhi_epi32(m0,m1); \ +t1 = _mm_blend_epi16(t0,m3,0x0F); \ +buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1)); + +#define LOAD_MSG_7_2(buf) \ +t0 = _mm_blend_epi16(m2,m3,0x30); \ +t1 = _mm_srli_si128(m0,4); \ +t2 = _mm_blend_epi16(t0,t1,0x03); \ +buf = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3)); + +#define LOAD_MSG_7_3(buf) \ +t0 = _mm_unpackhi_epi64(m0,m3); \ +t1 = _mm_unpacklo_epi64(m1,m2); \ +t2 = _mm_blend_epi16(t0,t1,0x3C); \ +buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1)); + +#define LOAD_MSG_7_4(buf) \ +t0 = _mm_unpacklo_epi32(m0,m1); \ +t1 = _mm_unpackhi_epi32(m1,m2); \ +buf = _mm_unpacklo_epi64(t0,t1); + +#define LOAD_MSG_8_1(buf) \ +t0 = _mm_unpackhi_epi32(m1,m3); \ +t1 = _mm_unpacklo_epi64(t0,m0); \ +t2 = _mm_blend_epi16(t1,m2,0xC0); \ +buf = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2)); + +#define LOAD_MSG_8_2(buf) \ +t0 = _mm_unpackhi_epi32(m0,m3); \ +t1 = _mm_blend_epi16(m2,t0,0xF0); \ +buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3)); + +#define LOAD_MSG_8_3(buf) \ +t0 = _mm_blend_epi16(m2,m0,0x0C); \ +t1 = _mm_slli_si128(t0,4); \ +buf = _mm_blend_epi16(t1,m3,0x0F); + +#define LOAD_MSG_8_4(buf) \ +t0 = _mm_blend_epi16(m1,m0,0x30); \ +buf = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2)); + +#define LOAD_MSG_9_1(buf) \ +t0 = _mm_blend_epi16(m0,m2,0x03); \ +t1 = _mm_blend_epi16(m1,m2,0x30); \ +t2 = _mm_blend_epi16(t1,t0,0x0F); \ +buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2)); + +#define LOAD_MSG_9_2(buf) \ +t0 = _mm_slli_si128(m0,4); \ +t1 = _mm_blend_epi16(m1,t0,0xC0); \ +buf = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3)); + +#define LOAD_MSG_9_3(buf) \ +t0 = _mm_unpackhi_epi32(m0,m3); \ +t1 = _mm_unpacklo_epi32(m2,m3); \ +t2 = _mm_unpackhi_epi64(t0,t1); \ +buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1)); + +#define LOAD_MSG_9_4(buf) \ +t0 = _mm_blend_epi16(m3,m2,0xC0); \ +t1 = _mm_unpacklo_epi32(m0,m3); \ +t2 = _mm_blend_epi16(t0,t1,0x0F); \ +buf = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3)); + +/*==============================================================*/ +#else /* defined(HAVE_SSE41) */ +/*==============================================================*/ +/* --- blake2s-load-sse2.h */ + +#define LOAD_MSG_0_1(buf) buf = _mm_set_epi32(m6,m4,m2,m0) +#define LOAD_MSG_0_2(buf) buf = _mm_set_epi32(m7,m5,m3,m1) +#define LOAD_MSG_0_3(buf) buf = _mm_set_epi32(m14,m12,m10,m8) +#define LOAD_MSG_0_4(buf) buf = _mm_set_epi32(m15,m13,m11,m9) +#define LOAD_MSG_1_1(buf) buf = _mm_set_epi32(m13,m9,m4,m14) +#define LOAD_MSG_1_2(buf) buf = _mm_set_epi32(m6,m15,m8,m10) +#define LOAD_MSG_1_3(buf) buf = _mm_set_epi32(m5,m11,m0,m1) +#define LOAD_MSG_1_4(buf) buf = _mm_set_epi32(m3,m7,m2,m12) +#define LOAD_MSG_2_1(buf) buf = _mm_set_epi32(m15,m5,m12,m11) +#define LOAD_MSG_2_2(buf) buf = _mm_set_epi32(m13,m2,m0,m8) +#define LOAD_MSG_2_3(buf) buf = _mm_set_epi32(m9,m7,m3,m10) +#define LOAD_MSG_2_4(buf) buf = _mm_set_epi32(m4,m1,m6,m14) +#define LOAD_MSG_3_1(buf) buf = _mm_set_epi32(m11,m13,m3,m7) +#define LOAD_MSG_3_2(buf) buf = _mm_set_epi32(m14,m12,m1,m9) +#define LOAD_MSG_3_3(buf) buf = _mm_set_epi32(m15,m4,m5,m2) +#define LOAD_MSG_3_4(buf) buf = _mm_set_epi32(m8,m0,m10,m6) +#define LOAD_MSG_4_1(buf) buf = _mm_set_epi32(m10,m2,m5,m9) +#define LOAD_MSG_4_2(buf) buf = _mm_set_epi32(m15,m4,m7,m0) +#define LOAD_MSG_4_3(buf) buf = _mm_set_epi32(m3,m6,m11,m14) +#define LOAD_MSG_4_4(buf) buf = _mm_set_epi32(m13,m8,m12,m1) +#define LOAD_MSG_5_1(buf) buf = _mm_set_epi32(m8,m0,m6,m2) +#define LOAD_MSG_5_2(buf) buf = _mm_set_epi32(m3,m11,m10,m12) +#define LOAD_MSG_5_3(buf) buf = _mm_set_epi32(m1,m15,m7,m4) +#define LOAD_MSG_5_4(buf) buf = _mm_set_epi32(m9,m14,m5,m13) +#define LOAD_MSG_6_1(buf) buf = _mm_set_epi32(m4,m14,m1,m12) +#define LOAD_MSG_6_2(buf) buf = _mm_set_epi32(m10,m13,m15,m5) +#define LOAD_MSG_6_3(buf) buf = _mm_set_epi32(m8,m9,m6,m0) +#define LOAD_MSG_6_4(buf) buf = _mm_set_epi32(m11,m2,m3,m7) +#define LOAD_MSG_7_1(buf) buf = _mm_set_epi32(m3,m12,m7,m13) +#define LOAD_MSG_7_2(buf) buf = _mm_set_epi32(m9,m1,m14,m11) +#define LOAD_MSG_7_3(buf) buf = _mm_set_epi32(m2,m8,m15,m5) +#define LOAD_MSG_7_4(buf) buf = _mm_set_epi32(m10,m6,m4,m0) +#define LOAD_MSG_8_1(buf) buf = _mm_set_epi32(m0,m11,m14,m6) +#define LOAD_MSG_8_2(buf) buf = _mm_set_epi32(m8,m3,m9,m15) +#define LOAD_MSG_8_3(buf) buf = _mm_set_epi32(m10,m1,m13,m12) +#define LOAD_MSG_8_4(buf) buf = _mm_set_epi32(m5,m4,m7,m2) +#define LOAD_MSG_9_1(buf) buf = _mm_set_epi32(m1,m7,m8,m10) +#define LOAD_MSG_9_2(buf) buf = _mm_set_epi32(m5,m6,m4,m2) +#define LOAD_MSG_9_3(buf) buf = _mm_set_epi32(m13,m3,m9,m15) +#define LOAD_MSG_9_4(buf) buf = _mm_set_epi32(m0,m12,m14,m11) + +/*==============================================================*/ +#endif /* defined(HAVE_SSE41) */ + +#define ROUND(r) \ + LOAD_MSG_ ##r ##_1(buf1); \ + G1(row1,row2,row3,row4,buf1); \ + LOAD_MSG_ ##r ##_2(buf2); \ + G2(row1,row2,row3,row4,buf2); \ + DIAGONALIZE(row1,row2,row3,row4); \ + LOAD_MSG_ ##r ##_3(buf3); \ + G1(row1,row2,row3,row4,buf3); \ + LOAD_MSG_ ##r ##_4(buf4); \ + G2(row1,row2,row3,row4,buf4); \ + UNDIAGONALIZE(row1,row2,row3,row4); \ + +/*==============================================================*/ +#else /* XXXSSE */ +/*==============================================================*/ static const uint8_t blake2s_sigma[10][16] = { {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, @@ -36,6 +613,36 @@ {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0}, }; +#define G(r,i,a,b,c,d) \ + do { \ + a = a + b + m[blake2s_sigma[r][2*i+0]]; \ + d = rotr32(d ^ a, 16); \ + c = c + d; \ + b = rotr32(b ^ c, 12); \ + a = a + b + m[blake2s_sigma[r][2*i+1]]; \ + d = rotr32(d ^ a, 8); \ + c = c + d; \ + b = rotr32(b ^ c, 7); \ + } while(0) +#define ROUND(r) \ + do { \ + G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ + G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ + G(r,2,v[ 2],v[ 6],v[10],v[14]); \ + G(r,3,v[ 3],v[ 7],v[11],v[15]); \ + G(r,4,v[ 0],v[ 5],v[10],v[15]); \ + G(r,5,v[ 1],v[ 6],v[11],v[12]); \ + G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ + G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ + } while(0) +/*==============================================================*/ +#endif /* XXXSSE */ + +static const uint32_t blake2s_IV[8] = { + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL +}; + static inline int blake2s_set_lastnode(blake2s_state * S) { S->f[1] = ~0U; @@ -70,8 +677,15 @@ static inline int blake2s_increment_counter(blake2s_state * S, const uint32_t inc) { +#ifdef XXXSSE + uint64_t t = ((uint64_t) S->t[1] << 32) | S->t[0]; + t += inc; + S->t[0] = (uint32_t) (t >> 0); + S->t[1] = (uint32_t) (t >> 32); +#else /* XXXSSE */ S->t[0] += inc; S->t[1] += (S->t[0] < inc); +#endif /* XXXSSE */ return 0; } @@ -159,6 +773,18 @@ /* init2 xors IV with input parameter block */ int blake2s_init_param(blake2s_state * S, const blake2s_param * P) { +#ifdef XXXSSE + uint8_t *p, *h, *v; + //blake2s_init0( S ); + v = (uint8_t *) (blake2s_IV); + h = (uint8_t *) (S->h); + p = (uint8_t *) (P); + /* IV XOR ParamBlock */ + memset(S, 0, sizeof(blake2s_state)); + + for (int i = 0; i < BLAKE2S_OUTBYTES; ++i) + h[i] = v[i] ^ p[i]; +#else /* XXXSSE */ blake2s_init0(S); uint32_t *p = (uint32_t *) (P); size_t i; @@ -166,6 +792,7 @@ /* IV XOR ParamBlock */ for (i = 0; i < 8; ++i) S->h[i] ^= load32(&p[i]); +#endif /* XXXSSE */ return 0; } @@ -180,6 +807,22 @@ if (outlen == 0 || outlen > BLAKE2S_OUTBYTES) return -1; +#ifdef XXXSSE + { const blake2s_param _P = { + .digest_length = outlen, + .key_length = 0, + .fanout = 1, + .depth = 1, + .leaf_length = 0, + .node_offset = {0}, + .node_depth = 0, + .inner_length = 0, + .salt = {0}, + .personal = {0} + }; + P[0] = *(blake2s_param *)&_P; /* structure assignment */ + } +#else /* XXXSSE */ P->digest_length = outlen; P->key_length = 0; P->fanout = 1; @@ -191,12 +834,15 @@ // memset(P->reserved, 0, sizeof(P->reserved) ); memset(P->salt, 0, sizeof(P->salt)); memset(P->personal, 0, sizeof(P->personal)); +#endif /* XXXSSE */ + return blake2s_init_param(S, P); } int blake2s_init_key(blake2s_state * S, const uint8_t outlen, const void *key, const uint8_t keylen) { + /* Move interval verification here? */ blake2s_param P[1]; if (outlen == 0 || outlen > BLAKE2S_OUTBYTES) @@ -205,6 +851,22 @@ if (!key || !keylen || keylen > BLAKE2S_KEYBYTES) return -1; +#ifdef XXXSSE + { const blake2s_param _P = { + .digest_length = outlen, + .key_length = keylen, + .fanout = 1, + .depth = 1, + .leaf_length = 0, + .node_offset = {0}, + .node_depth = 0, + .inner_length = 0, + .salt = {0}, + .personal = {0} + }; + P[0] = *(blake2s_param *)&_P; /* structure assignment */ + } +#else /* XXXSSE */ P->digest_length = outlen; P->key_length = keylen; P->fanout = 1; @@ -216,6 +878,7 @@ // memset(P->reserved, 0, sizeof(P->reserved) ); memset(P->salt, 0, sizeof(P->salt)); memset(P->personal, 0, sizeof(P->personal)); +#endif /* XXXSSE */ if (blake2s_init_param(S, P) < 0) return -1; @@ -230,9 +893,58 @@ return 0; } -static int blake2s_compress(blake2s_state * S, +static inline int blake2s_compress(blake2s_state * S, const uint8_t block[BLAKE2S_BLOCKBYTES]) { +#ifdef XXXSSE + __m128i row1, row2, row3, row4; + __m128i buf1, buf2, buf3, buf4; +#if defined(HAVE_SSE41) + __m128i t0, t1; +#if !defined(HAVE_XOP) + __m128i t2; +#endif +#endif + __m128i ff0, ff1; +#if defined(HAVE_SSSE3) && !defined(HAVE_XOP) + const __m128i r8 = + _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1); + const __m128i r16 = + _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2); +#endif +#if defined(HAVE_SSE41) + const __m128i m0 = LOADU(block + 00); + const __m128i m1 = LOADU(block + 16); + const __m128i m2 = LOADU(block + 32); + const __m128i m3 = LOADU(block + 48); +#else + const uint32_t m0 = ((uint32_t *) block)[0]; + const uint32_t m1 = ((uint32_t *) block)[1]; + const uint32_t m2 = ((uint32_t *) block)[2]; + const uint32_t m3 = ((uint32_t *) block)[3]; + const uint32_t m4 = ((uint32_t *) block)[4]; + const uint32_t m5 = ((uint32_t *) block)[5]; + const uint32_t m6 = ((uint32_t *) block)[6]; + const uint32_t m7 = ((uint32_t *) block)[7]; + const uint32_t m8 = ((uint32_t *) block)[8]; + const uint32_t m9 = ((uint32_t *) block)[9]; + const uint32_t m10 = ((uint32_t *) block)[10]; + const uint32_t m11 = ((uint32_t *) block)[11]; + const uint32_t m12 = ((uint32_t *) block)[12]; + const uint32_t m13 = ((uint32_t *) block)[13]; + const uint32_t m14 = ((uint32_t *) block)[14]; + const uint32_t m15 = ((uint32_t *) block)[15]; +#endif + row1 = ff0 = LOADU(&S->h[0]); + row2 = ff1 = LOADU(&S->h[4]); + row3 = _mm_setr_epi32(0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A); + row4 = + _mm_xor_si128(_mm_setr_epi32 + (0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19), + LOADU(&S->t[0])); + +#else /* XXXSSE */ + uint32_t m[16]; uint32_t v[16]; size_t i; @@ -251,28 +963,8 @@ v[13] = S->t[1] ^ blake2s_IV[5]; v[14] = S->f[0] ^ blake2s_IV[6]; v[15] = S->f[1] ^ blake2s_IV[7]; -#define G(r,i,a,b,c,d) \ - do { \ - a = a + b + m[blake2s_sigma[r][2*i+0]]; \ - d = rotr32(d ^ a, 16); \ - c = c + d; \ - b = rotr32(b ^ c, 12); \ - a = a + b + m[blake2s_sigma[r][2*i+1]]; \ - d = rotr32(d ^ a, 8); \ - c = c + d; \ - b = rotr32(b ^ c, 7); \ - } while(0) -#define ROUND(r) \ - do { \ - G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ - G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ - G(r,2,v[ 2],v[ 6],v[10],v[14]); \ - G(r,3,v[ 3],v[ 7],v[11],v[15]); \ - G(r,4,v[ 0],v[ 5],v[10],v[15]); \ - G(r,5,v[ 1],v[ 6],v[11],v[12]); \ - G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ - G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ - } while(0) +#endif /* XXXSSE */ + ROUND(0); ROUND(1); ROUND(2); @@ -284,11 +976,14 @@ ROUND(8); ROUND(9); +#ifdef XXXSSE + STOREU( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) ); + STOREU( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) ); +#else for (i = 0; i < 8; ++i) S->h[i] = S->h[i] ^ v[i] ^ v[i + 8]; +#endif -#undef G -#undef ROUND return 0; } @@ -344,6 +1039,7 @@ return 0; } +/* inlen, at least, should be uint64_t. Others can be size_t. */ int blake2s(uint8_t * out, const void *in, const void *key, const uint8_t outlen, const uint64_t inlen, uint8_t keylen) { @@ -372,6 +1068,14 @@ return 0; } +#if defined(SUPERCOP) +int crypto_hash(unsigned char *out, unsigned char *in, + unsigned long long inlen) +{ + return blake2s(out, in, NULL, BLAKE2S_OUTBYTES, inlen, 0); +} +#endif + #if defined(BLAKE2S_SELFTEST) #include "blake2-kat.h" int main(int argc, char **argv) @@ -388,9 +1092,9 @@ for (i = 0; i < KAT_LENGTH; ++i) { uint8_t hash[BLAKE2S_OUTBYTES]; - blake2s(hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES); - if (memcmp(hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES)) { + if (blake2s(hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES) < 0 + || memcmp(hash, blake2s_keyed_kat[i], BLAKE2S_OUTBYTES)) { puts("error"); return -1; } @@ . patch -p0 <<'@@ .' Index: rpm/rpmio/blake2sp.c ============================================================================ $ cvs diff -u -r1.1.2.4 -r1.1.2.5 blake2sp.c --- rpm/rpmio/blake2sp.c 27 Mar 2014 19:14:04 -0000 1.1.2.4 +++ rpm/rpmio/blake2sp.c 9 Oct 2014 02:40:41 -0000 1.1.2.5 @@ -35,7 +35,7 @@ store32(&P->leaf_length, 0); store48(P->node_offset, offset); P->node_depth = 0; - P->inner_length = outlen; + P->inner_length = BLAKE2S_OUTBYTES; memset(P->salt, 0, sizeof(P->salt)); memset(P->personal, 0, sizeof(P->personal)); return blake2s_init_param(S, P); @@ -52,7 +52,7 @@ store32(&P->leaf_length, 0); store48(P->node_offset, 0ULL); P->node_depth = 1; - P->inner_length = outlen; + P->inner_length = BLAKE2S_OUTBYTES; memset(P->salt, 0, sizeof(P->salt)); memset(P->personal, 0, sizeof(P->personal)); return blake2s_init_param(S, P); @@ -289,9 +289,9 @@ for (i = 0; i < KAT_LENGTH; ++i) { uint8_t hash[BLAKE2S_OUTBYTES]; - blake2sp(hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES); - if (memcmp(hash, blake2sp_keyed_kat[i], BLAKE2S_OUTBYTES)) { + if (blake2sp(hash, buf, key, BLAKE2S_OUTBYTES, i, BLAKE2S_KEYBYTES) < 0 + || memcmp(hash, blake2sp_keyed_kat[i], BLAKE2S_OUTBYTES)) { puts("error"); return -1; } @@ . patch -p0 <<'@@ .' Index: rpm/rpmio/tgit.c ============================================================================ $ cvs diff -u -r1.1.2.44 -r1.1.2.45 tgit.c --- rpm/rpmio/tgit.c 7 Oct 2014 18:16:56 -0000 1.1.2.44 +++ rpm/rpmio/tgit.c 9 Oct 2014 02:40:41 -0000 1.1.2.45 @@ -59,13 +59,13 @@ FILE * fp = stdout; if (pd->total_steps > 0) { - fprintf(fp, "\rCheckouts: %d/%d", pd->completed_steps, pd->total_steps); + fprintf(fp, "\rCheckouts: %u/%u", (unsigned)pd->completed_steps, (unsigned)pd->total_steps); } else if (pd->fetch_progress.total_deltas > 0 && pd->fetch_progress.received_objects == pd->fetch_progress.total_objects) { - fprintf(fp, "\rResolving deltas %d/%d", - pd->fetch_progress.indexed_deltas, - pd->fetch_progress.total_deltas); + fprintf(fp, "\rResolving deltas %u/%u", + (unsigned)pd->fetch_progress.indexed_deltas, + (unsigned)pd->fetch_progress.total_deltas); if (pd->fetch_progress.indexed_deltas == pd->fetch_progress.total_deltas) fprintf(fp, ", done.\n"); } else { @@ -79,10 +79,11 @@ Receiving objects: 100% (53569/53569), 20.52 MiB | 5.24 MiB/s, done. Resolving deltas: 100% (36892/36892), done. #endif - fprintf(fp, "\rReceiving objects: %3d%% (%d/%d) indexed (%d), %6.2f MiB", + fprintf(fp, "\rReceiving objects: %3d%% (%u/%u) indexed (%u), %6.2f MiB", network_percent, - pd->fetch_progress.received_objects, pd->fetch_progress.total_objects, - pd->fetch_progress.indexed_objects, + (unsigned)pd->fetch_progress.received_objects, + (unsigned)pd->fetch_progress.total_objects, + (unsigned)pd->fetch_progress.indexed_objects, (pd->fetch_progress.received_bytes / (1024. * 1024.)) ); if (pd->fetch_progress.received_objects == pd->fetch_progress.total_objects) fprintf(fp, ", done.\n"); @@ . ______________________________________________________________________ RPM Package Manager http://rpm5.org CVS Sources Repository rpm-cvs@rpm5.org