On 05/28/2015 08:51 AM, Andrew Haley wrote: > On 27/05/15 21:35, Anthony Scarpino wrote: > >> How much of the montgomery multiply and sqaure are you planning to >> intrinsify? Are you doing the whole thing or just portions of the >> operations similar to multiplyToLen and squareToLen? > > I'm doing the whole montgomery multiply and square operation in a > routine which interleaves multiplication and reduction so that the > intermediate product is never written into memory.
For a bit more information, here's the algorithm in C: Andrew. // Multiply (unsigned) Long A by Long B, accumulating the double- // length result into the accumulator formed of T0, T1, and T2. #define MACC(A, B, T0, T1, T2) \ do { \ unsigned long hi, lo; \ __asm__ ("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ : "r"(A), "a"(B)); \ } while(0) // Fast Montgomery multiplication. The derivation of the algorithm is // in A Cryptographic Library for the Motorola DSP56000, // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. static void __attribute__((noinline)) montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[], unsigned long m[], unsigned long inv, int len) { unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator int i; assert(inv * n[0] == -1UL); for (i = 0; i < len; i++) { int j; for (j = 0; j < i; j++) { MACC(a[j], b[i-j], t0, t1, t2); MACC(m[j], n[i-j], t0, t1, t2); } MACC(a[i], b[0], t0, t1, t2); m[i] = t0 * inv; MACC(m[i], n[0], t0, t1, t2); assert(t0 == 0); t0 = t1; t1 = t2; t2 = 0; } for (i = len; i < 2*len; i++) { int j; for (j = i-len+1; j < len; j++) { MACC(a[j], b[i-j], t0, t1, t2); MACC(m[j], n[i-j], t0, t1, t2); } m[i-len] = t0; t0 = t1; t1 = t2; t2 = 0; } m[len] = t0; while(cmp(m, n, len+1) > 0) sub(m, n, len+1); }