>>>> Oops! Wrong patch! The correct one is attached. If you feel like testing
>>>> the wrong one, go ahead; it is only missing some later, non-essential
>>>> adjustments.
>>>>
>>>> diff --git a/crypto/ec/ecp_nistz256.c b/crypto/ec/ecp_nistz256.c
>>>> index bf3fcc6..33b07ce 100644
>>>> --- a/crypto/ec/ecp_nistz256.c
>>>> +++ b/crypto/ec/ecp_nistz256.c
>>>> @@ -637,7 +637,7 @@ static void ecp_nistz256_windowed_mul(const EC_GROUP *group,
>>>>          ecp_nistz256_point_double(&row[10 - 1], &row[ 5 - 1]);
>>>>          ecp_nistz256_point_add   (&row[15 - 1], &row[14 - 1], &row[1 - 1]);
>>>>          ecp_nistz256_point_add   (&row[11 - 1], &row[10 - 1], &row[1 - 1]);
>>>> -        ecp_nistz256_point_add   (&row[16 - 1], &row[15 - 1], &row[1 - 1]);
>>>> +        ecp_nistz256_point_double(&row[16 - 1], &row[ 8 - 1]);
>>>>      }
>>>>
>>>>      index = 255;
>>> I can believe that this fixes the issue, but it's just masking it, no?
> 
> The underlying problem is that the assembly routines return "partially
> reduced" results. "Partially reduced" means that a routine can return
> result + modulus, as long as that still fits in 256 bits. The rationale is
> that ((x+m)*y)%m = (x*y+m*y)%m = x*y%m + m*y%m, and the last term is 0.
> While this does work for a series of multiplications, I failed to
> recognize that there are corner cases in the non-multiplication
> operations. I'm preparing an update...

Attached. Performance is a little bit worse on some CPUs. I also took the
opportunity to harmonize ecp_nistz256_from_mont by applying the same
reduction pattern. The patch is cumulative, i.e. not incremental to the
previously posted one[s], and it addresses both problems: the originally
reported one and the one discovered along the way. The patch to
ecp_nistz256.c referred to above no longer matters.
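
To illustrate the partial-reduction rationale quoted above with a toy
modulus in place of P-256 (just a sketch, not part of the patch): an
operand carrying an extra modulus multiplies to the same residue as the
fully reduced one.

#include <assert.h>
#include <stdio.h>

int main(void)
{
    const unsigned long m = 23;     /* toy stand-in for the P-256 modulus */

    for (unsigned long x = 0; x < m; x++)
        for (unsigned long y = 0; y < m; y++)
            /* a "partially reduced" operand x + m yields the same
             * residue as the fully reduced x after a multiplication */
            assert(((x + m) * y) % m == (x * y) % m);

    printf("((x+m)*y) %% m == (x*y) %% m for all x, y < %lu\n", m);
    return 0;
}

The corner cases mentioned above are in the non-multiplication
operations, where an operand carrying an extra modulus is not
necessarily absorbed in the same way.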


diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index 4486a5e..cdff22a 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -31,15 +31,16 @@
 # Further optimization by <ap...@openssl.org>:
 #
 #		this/original
-# Opteron	+8-33%
-# Bulldozer	+10-30%
-# P4		+14-38%
-# Westmere	+8-23%
-# Sandy Bridge	+8-24%
-# Ivy Bridge	+7-25%
-# Haswell	+5-25%
-# Atom		+10-32%
-# VIA Nano	+37-130%
+# Opteron	+12-49%
+# Bulldozer	+14-45%
+# P4		+18-46%
+# Westmere	+12-34%
+# Sandy Bridge	+9-35%
+# Ivy Bridge	+9-35%
+# Haswell	+8-37%
+# Broadwell	+18-58%
+# Atom		+15-50%
+# VIA Nano	+43-160%
 #
 # Ranges denote minimum and maximum improvement coefficients depending
 # on benchmark. Lower coefficients are for ECDSA sign, relatively
@@ -550,28 +551,20 @@ __ecp_nistz256_mul_montq:
 	# and add the result to the acc.
 	# Due to the special form of p256 we do some optimizations
 	#
-	# acc[0] x p256[0] = acc[0] x 2^64 - acc[0]
-	# then we add acc[0] and get acc[0] x 2^64
-
-	mulq	$poly1
-	xor	$t0, $t0
-	add	$acc0, $acc1		# +=acc[0]*2^64
-	adc	\$0, %rdx
-	add	%rax, $acc1
-	mov	$acc0, %rax
-
-	# acc[0] x p256[2] = 0
-	adc	%rdx, $acc2
-	adc	\$0, $t0
+	# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
+	# then we add acc[0] and get acc[0] x 2^96
 
+	mov	$acc0, $t1
+	shl	\$32, $acc0
 	mulq	$poly3
-	xor	$acc0, $acc0
-	add	$t0, $acc3
-	adc	\$0, %rdx
-	add	%rax, $acc3
+	shr	\$32, $t1
+	add	$acc0, $acc1		# +=acc[0]<<96
+	adc	$t1, $acc2
+	adc	%rax, $acc3
 	 mov	8*1($b_ptr), %rax
 	adc	%rdx, $acc4
 	adc	\$0, $acc5
+	xor	$acc0, $acc0
 
 	########################################################################
 	# Multiply by b[1]
@@ -608,23 +601,17 @@ __ecp_nistz256_mul_montq:
 
 	########################################################################
 	# Second reduction step	
-	mulq	$poly1
-	xor	$t0, $t0
-	add	$acc1, $acc2
-	adc	\$0, %rdx
-	add	%rax, $acc2
-	mov	$acc1, %rax
-	adc	%rdx, $acc3
-	adc	\$0, $t0
-
+	mov	$acc1, $t1
+	shl	\$32, $acc1
 	mulq	$poly3
-	xor	$acc1, $acc1
-	add	$t0, $acc4
-	adc	\$0, %rdx
-	add	%rax, $acc4
+	shr	\$32, $t1
+	add	$acc1, $acc2
+	adc	$t1, $acc3
+	adc	%rax, $acc4
 	 mov	8*2($b_ptr), %rax
 	adc	%rdx, $acc5
 	adc	\$0, $acc0
+	xor	$acc1, $acc1
 
 	########################################################################
 	# Multiply by b[2]
@@ -661,23 +648,17 @@ __ecp_nistz256_mul_montq:
 
 	########################################################################
 	# Third reduction step	
-	mulq	$poly1
-	xor	$t0, $t0
-	add	$acc2, $acc3
-	adc	\$0, %rdx
-	add	%rax, $acc3
-	mov	$acc2, %rax
-	adc	%rdx, $acc4
-	adc	\$0, $t0
-
+	mov	$acc2, $t1
+	shl	\$32, $acc2
 	mulq	$poly3
-	xor	$acc2, $acc2
-	add	$t0, $acc5
-	adc	\$0, %rdx
-	add	%rax, $acc5
+	shr	\$32, $t1
+	add	$acc2, $acc3
+	adc	$t1, $acc4
+	adc	%rax, $acc5
 	 mov	8*3($b_ptr), %rax
 	adc	%rdx, $acc0
 	adc	\$0, $acc1
+	xor	$acc2, $acc2
 
 	########################################################################
 	# Multiply by b[3]
@@ -714,20 +695,14 @@ __ecp_nistz256_mul_montq:
 
 	########################################################################
 	# Final reduction step	
-	mulq	$poly1
-	#xor	$t0, $t0
-	add	$acc3, $acc4
-	adc	\$0, %rdx
-	add	%rax, $acc4
-	mov	$acc3, %rax
-	adc	%rdx, $acc5
-	#adc	\$0, $t0		# doesn't overflow
-
+	mov	$acc3, $t1
+	shl	\$32, $acc3
 	mulq	$poly3
-	#add	$t0, $acc0
-	#adc	\$0, %rdx
+	shr	\$32, $t1
+	add	$acc3, $acc4
+	adc	$t1, $acc5
 	 mov	$acc4, $t0
-	add	%rax, $acc0
+	adc	%rax, $acc0
 	adc	%rdx, $acc1
 	 mov	$acc5, $t1
 	adc	\$0, $acc2
@@ -740,14 +715,14 @@ __ecp_nistz256_mul_montq:
 	sbb	\$0, $acc0		# .Lpoly[2]
 	 mov	$acc1, $t3
 	sbb	$poly3, $acc1		# .Lpoly[3]
-	neg	$acc2
+	sbb	\$0, $acc2
 
-	cmovnc	$t0, $acc4
-	cmovnc	$t1, $acc5
+	cmovc	$t0, $acc4
+	cmovc	$t1, $acc5
 	mov	$acc4, 8*0($r_ptr)
-	cmovnc	$t2, $acc0
+	cmovc	$t2, $acc0
 	mov	$acc5, 8*1($r_ptr)
-	cmovnc	$t3, $acc1
+	cmovc	$t3, $acc1
 	mov	$acc0, 8*2($r_ptr)
 	mov	$acc1, 8*3($r_ptr)
 
@@ -897,89 +872,62 @@ __ecp_nistz256_sqr_montq:
 	##########################################
 	# Now the reduction
 	# First iteration
-	mulq	$a_ptr
-	#xor	$t0, $t0
-	add	$acc0, $acc1
-	adc	\$0, %rdx
-	add	%rax, $acc1
-	mov	$acc0, %rax
-	adc	%rdx, $acc2	# doesn't overflow
-	#adc	\$0, $t0
-
+	mov	$acc0, $t0
+	shl	\$32, $acc0
 	mulq	$t1
-	xor	$acc0, $acc0
-	#add	$t0, $acc3
-	#adc	\$0, %rdx
-	add	%rax, $acc3
+	shr	\$32, $t0
+	add	$acc0, $acc1		# +=acc[0]<<96
+	adc	$t0, $acc2
+	adc	%rax, $acc3
 	 mov	$acc1, %rax
-	adc	%rdx, $acc4
-	adc	\$0, $acc0
+	adc	\$0, %rdx
 
 	##########################################
 	# Second iteration
-	mulq	$a_ptr
-	#xor	$t0, $t0
-	add	$acc1, $acc2
-	adc	\$0, %rdx
-	add	%rax, $acc2
-	mov	$acc1, %rax
-	adc	%rdx, $acc3	# doesn't overflow
-	#adc	\$0, $t0
-
+	mov	$acc1, $t0
+	shl	\$32, $acc1
+	mov	%rdx, $acc0
 	mulq	$t1
-	xor	$acc1, $acc1
-	#add	$t0, $acc4
-	#adc	\$0, %rdx
-	add	%rax, $acc4
+	shr	\$32, $t0
+	add	$acc1, $acc2
+	adc	$t0, $acc3
+	adc	%rax, $acc0
 	 mov	$acc2, %rax
-	adc	%rdx, $acc0
-	adc	\$0, $acc1
+	adc	\$0, %rdx
 
 	##########################################
 	# Third iteration
-	mulq	$a_ptr
-	#xor	$t0, $t0
-	add	$acc2, $acc3
-	adc	\$0, %rdx
-	add	%rax, $acc3
-	mov	$acc2, %rax
-	adc	%rdx, $acc4	# doesn't overflow
-	#adc	\$0, $t0
-
+	mov	$acc2, $t0
+	shl	\$32, $acc2
+	mov	%rdx, $acc1
 	mulq	$t1
-	xor	$acc2, $acc2
-	#add	$t0, $acc0
-	#adc	\$0, %rdx
-	add	%rax, $acc0
+	shr	\$32, $t0
+	add	$acc2, $acc3
+	adc	$t0, $acc0
+	adc	%rax, $acc1
 	 mov	$acc3, %rax
-	adc	%rdx, $acc1
-	adc	\$0, $acc2
+	adc	\$0, %rdx
 
 	###########################################
 	# Last iteration
-	mulq	$a_ptr
-	#xor	$t0, $t0
-	add	$acc3, $acc4
-	adc	\$0, %rdx
-	add	%rax, $acc4
-	mov	$acc3, %rax
-	adc	%rdx, $acc0	# doesn't overflow
-	#adc	\$0, $t0
-
+	mov	$acc3, $t0
+	shl	\$32, $acc3
+	mov	%rdx, $acc2
 	mulq	$t1
+	shr	\$32, $t0
+	add	$acc3, $acc0
+	adc	$t0, $acc1
+	adc	%rax, $acc2
+	adc	\$0, %rdx
 	xor	$acc3, $acc3
-	#add	$t0, $acc1
-	#adc	\$0, %rdx
-	add	%rax, $acc1
-	adc	%rdx, $acc2
-	adc	\$0, $acc3
 
 	############################################
 	# Add the rest of the acc
-	add	$acc0, $acc5
+	add	$acc0, $acc4
+	adc	$acc1, $acc5
 	 mov	$acc4, $acc0
-	adc	$acc1, $acc6
-	adc	$acc2, $acc7
+	adc	$acc2, $acc6
+	adc	%rdx, $acc7
 	 mov	$acc5, $acc1
 	adc	\$0, $acc3
 
@@ -989,14 +937,14 @@ __ecp_nistz256_sqr_montq:
 	sbb	\$0, $acc6		# .Lpoly[2]
 	 mov	$acc7, $t0
 	sbb	$t1, $acc7		# .Lpoly[3]
-	neg	$acc3
+	sbb	\$0, $acc3
 
-	cmovnc	$acc0, $acc4
-	cmovnc	$acc1, $acc5
+	cmovc	$acc0, $acc4
+	cmovc	$acc1, $acc5
 	mov	$acc4, 8*0($r_ptr)
-	cmovnc	$acc2, $acc6
+	cmovc	$acc2, $acc6
 	mov	$acc5, 8*1($r_ptr)
-	cmovnc	$t0, $acc7
+	cmovc	$t0, $acc7
 	mov	$acc6, 8*2($r_ptr)
 	mov	$acc7, 8*3($r_ptr)
 
@@ -1028,18 +976,15 @@ __ecp_nistz256_mul_montx:
 
 	########################################################################
 	# First reduction step
-	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
-	adox	$t1, $acc1
-	adox	$t0, $acc2
+	add	$t1, $acc1
+	adc	$t0, $acc2
 
 	mulx	$poly3, $t0, $t1
 	 mov	8*1($b_ptr), %rdx
-	adox	$t0, $acc3
-	adcx	$t1, $acc4
-
-	adox	$acc0, $acc4
-	adcx	$acc0, $acc5		# cf=0
-	adox	$acc0, $acc5		# of=0
+	adc	$t0, $acc3
+	adc	$t1, $acc4
+	adc	\$0, $acc5
+	xor	$acc0, $acc0		# $acc0=0,cf=0,of=0
 
 	########################################################################
 	# Multiply by b[1]
@@ -1068,18 +1013,15 @@ __ecp_nistz256_mul_montx:
 
 	########################################################################
 	# Second reduction step
-	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0
-	adox	$t0, $acc2
-	adox	$t1, $acc3
+	add	$t0, $acc2
+	adc	$t1, $acc3
 
 	mulx	$poly3, $t0, $t1
 	 mov	8*2($b_ptr), %rdx
-	adox	$t0, $acc4
-	adcx	$t1, $acc5
-
-	adox	$acc1, $acc5
-	adcx	$acc1, $acc0		# cf=0
-	adox	$acc1, $acc0		# of=0
+	adc	$t0, $acc4
+	adc	$t1, $acc5
+	adc	\$0, $acc0
+	xor	$acc1 ,$acc1		# $acc1=0,cf=0,of=0
 
 	########################################################################
 	# Multiply by b[2]
@@ -1108,18 +1050,15 @@ __ecp_nistz256_mul_montx:
 
 	########################################################################
 	# Third reduction step
-	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
-	adox	$t0, $acc3
-	adox	$t1, $acc4
+	add	$t0, $acc3
+	adc	$t1, $acc4
 
 	mulx	$poly3, $t0, $t1
 	 mov	8*3($b_ptr), %rdx
-	adox	$t0, $acc5
-	adcx	$t1, $acc0
-
-	adox	$acc2, $acc0
-	adcx	$acc2, $acc1		# cf=0
-	adox	$acc2, $acc1		# of=0
+	adc	$t0, $acc5
+	adc	$t1, $acc0
+	adc	\$0, $acc1
+	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
 
 	########################################################################
 	# Multiply by b[3]
@@ -1148,38 +1087,34 @@ __ecp_nistz256_mul_montx:
 
 	########################################################################
 	# Fourth reduction step
-	xor	$acc3, $acc3		# $acc3=0,cf=0,of=0
-	adox	$t0, $acc4
-	adox	$t1, $acc5
+	add	$t0, $acc4
+	adc	$t1, $acc5
 
 	mulx	$poly3, $t0, $t1
 	 mov	$acc4, $t2
 	mov	.Lpoly+8*1(%rip), $poly1
-	adcx	$t0, $acc0
-	adox	$t1, $acc1
+	adc	$t0, $acc0
 	 mov	$acc5, $t3
-
-	adcx	$acc3, $acc1
-	adox	$acc3, $acc2
+	adc	$t1, $acc1
 	adc	\$0, $acc2
-	 mov	$acc0, $t0
 
 	########################################################################
 	# Branch-less conditional subtraction of P
 	xor	%eax, %eax
+	 mov	$acc0, $t0
 	sbb	\$-1, $acc4		# .Lpoly[0]
 	sbb	$poly1, $acc5		# .Lpoly[1]
 	sbb	\$0, $acc0		# .Lpoly[2]
 	 mov	$acc1, $t1
 	sbb	$poly3, $acc1		# .Lpoly[3]
+	sbb	\$0, $acc2
 
-	bt	\$0,$acc2
-	cmovnc	$t2, $acc4
-	cmovnc	$t3, $acc5
+	cmovc	$t2, $acc4
+	cmovc	$t3, $acc5
 	mov	$acc4, 8*0($r_ptr)
-	cmovnc	$t0, $acc0
+	cmovc	$t0, $acc0
 	mov	$acc5, 8*1($r_ptr)
-	cmovnc	$t1, $acc1
+	cmovc	$t1, $acc1
 	mov	$acc0, 8*2($r_ptr)
 	mov	$acc1, 8*3($r_ptr)
 
@@ -1247,52 +1182,44 @@ __ecp_nistz256_sqr_montx:
 	 mov	.Lpoly+8*3(%rip), $t1
 
 	# reduction step 1
-	xor	$acc0, $acc0
-	adcx	$t0, $acc1
-	adcx	$t4, $acc2
+	add	$t0, $acc1
+	adc	$t4, $acc2
 
-	mulx	$t1, $t0, $t4
+	mulx	$t1, $t0, $acc0
 	 mov	$acc1, %rdx
-	adcx	$t0, $acc3
+	adc	$t0, $acc3
 	 shlx	$a_ptr, $acc1, $t0
-	adox	$t4, $acc0
-	 shrx	$a_ptr, $acc1, $t4
 	adc	\$0, $acc0
+	 shrx	$a_ptr, $acc1, $t4
 
 	# reduction step 2
-	xor	$acc1, $acc1
-	adcx	$t0, $acc2
-	adcx	$t4, $acc3
+	add	$t0, $acc2
+	adc	$t4, $acc3
 
-	mulx	$t1, $t0, $t4
+	mulx	$t1, $t0, $acc1
 	 mov	$acc2, %rdx
-	adcx	$t0, $acc0
+	adc	$t0, $acc0
 	 shlx	$a_ptr, $acc2, $t0
-	adox	$t4, $acc1
-	 shrx	$a_ptr, $acc2, $t4
 	adc	\$0, $acc1
+	 shrx	$a_ptr, $acc2, $t4
 
 	# reduction step 3
-	xor	$acc2, $acc2
-	adcx	$t0, $acc3
-	adcx	$t4, $acc0
+	add	$t0, $acc3
+	adc	$t4, $acc0
 
-	mulx	$t1, $t0, $t4
+	mulx	$t1, $t0, $acc2
 	 mov	$acc3, %rdx
-	adcx	$t0, $acc1
+	adc	$t0, $acc1
 	 shlx	$a_ptr, $acc3, $t0
-	adox	$t4, $acc2
-	 shrx	$a_ptr, $acc3, $t4
 	adc	\$0, $acc2
+	 shrx	$a_ptr, $acc3, $t4
 
 	# reduction step 4
-	xor	$acc3, $acc3
-	adcx	$t0, $acc0
-	adcx	$t4, $acc1
+	add	$t0, $acc0
+	adc	$t4, $acc1
 
-	mulx	$t1, $t0, $t4
-	adcx	$t0, $acc2
-	adox	$t4, $acc3
+	mulx	$t1, $t0, $acc3
+	adc	$t0, $acc2
 	adc	\$0, $acc3
 
 	xor	$t3, $t3		# cf=0
@@ -1312,14 +1239,14 @@ __ecp_nistz256_sqr_montx:
 	sbb	\$0, $acc6		# .Lpoly[2]
 	 mov	$acc7, $acc3
 	sbb	$t1, $acc7		# .Lpoly[3]
+	sbb	\$0, $t3
 
-	bt	\$0,$t3
-	cmovnc	$acc0, $acc4
-	cmovnc	$acc1, $acc5
+	cmovc	$acc0, $acc4
+	cmovc	$acc1, $acc5
 	mov	$acc4, 8*0($r_ptr)
-	cmovnc	$acc2, $acc6
+	cmovc	$acc2, $acc6
 	mov	$acc5, 8*1($r_ptr)
-	cmovnc	$acc3, $acc7
+	cmovc	$acc3, $acc7
 	mov	$acc6, 8*2($r_ptr)
 	mov	$acc7, 8*3($r_ptr)
 
@@ -1330,8 +1257,8 @@ ___
 }
 {
 my ($r_ptr,$in_ptr)=("%rdi","%rsi");
-my ($acc0,$acc1,$acc2,$acc3,$acc4)=map("%r$_",(8..12));
-my ($t0,$t1)=("%rcx","%rsi");
+my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
+my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
 
 $code.=<<___;
 ################################################################################
@@ -1348,109 +1275,83 @@ ecp_nistz256_from_mont:
 	push	%r13
 
 	mov	8*0($in_ptr), %rax
+	mov	.Lpoly+8*3(%rip), $t2
 	mov	8*1($in_ptr), $acc1
 	mov	8*2($in_ptr), $acc2
 	mov	8*3($in_ptr), $acc3
-	lea	.Lpoly(%rip), $in_ptr
-	xor	$acc4, $acc4
 	mov	%rax, $acc0
+	mov	.Lpoly+8*1(%rip), $t1
 
 	#########################################
 	# First iteration
-	mulq	1*8($in_ptr)
-	xor	$t0, $t0
+	mov	%rax, $t0
+	shl	\$32, $acc0
+	mulq	$t2
+	shr	\$32, $t0
 	add	$acc0, $acc1
-	adc	\$0, %rdx
-	add	%rax, $acc1
-	mov	$acc0, %rax
-	adc	%rdx, $acc2
-	adc	\$0, $t0
-
-	mulq	3*8($in_ptr)
-	xor	$acc0, $acc0
-	add	$t0, $acc3
-	adc	\$0, %rdx
-	add	%rax, $acc3
+	adc	$t0, $acc2
+	adc	%rax, $acc3
 	 mov	$acc1, %rax
-	adc	%rdx, $acc4
-	adc	\$0, $acc0
+	adc	\$0, %rdx
 
 	#########################################
 	# Second iteration
-	mulq	1*8($in_ptr)
-	xor	$t0, $t0
+	mov	$acc1, $t0
+	shl	\$32, $acc1
+	mov	%rdx, $acc0
+	mulq	$t2
+	shr	\$32, $t0
 	add	$acc1, $acc2
-	adc	\$0, %rdx
-	add	%rax, $acc2
-	mov	$acc1, %rax
-	adc	%rdx, $acc3
-	adc	\$0, $t0
-
-	mulq	3*8($in_ptr)
-	xor	$acc1, $acc1
-	add	$t0, $acc4
-	adc	\$0, %rdx
-	add	%rax, $acc4
+	adc	$t0, $acc3
+	adc	%rax, $acc0
 	 mov	$acc2, %rax
-	adc	%rdx, $acc0
-	adc	\$0, $acc1
+	adc	\$0, %rdx
 
 	##########################################
 	# Third iteration
-	mulq	1*8($in_ptr)
-	xor	$t0, $t0
+	mov	$acc2, $t0
+	shl	\$32, $acc2
+	mov	%rdx, $acc1
+	mulq	$t2
+	shr	\$32, $t0
 	add	$acc2, $acc3
-	adc	\$0, %rdx
-	add	%rax, $acc3
-	mov	$acc2, %rax
-	adc	%rdx, $acc4
-	adc	\$0, $t0
-
-	mulq	3*8($in_ptr)
-	xor	$acc2, $acc2
-	add	$t0, $acc0
-	adc	\$0, %rdx
-	add	%rax, $acc0
+	adc	$t0, $acc0
+	adc	%rax, $acc1
 	 mov	$acc3, %rax
-	adc	%rdx, $acc1
-	adc	\$0, $acc2
+	adc	\$0, %rdx
 
 	###########################################
 	# Last iteration
-	mulq	1*8($in_ptr)
-	xor	$t0, $t0
-	add	$acc3, $acc4
-	adc	\$0, %rdx
-	add	%rax, $acc4
-	mov	$acc3, %rax
-	adc	%rdx, $acc0
-	adc	\$0, $t0
-
-	mulq	3*8($in_ptr)
-	add	$t0, $acc1
+	mov	$acc3, $t0
+	shl	\$32, $acc3
+	mov	%rdx, $acc2
+	mulq	$t2
+	shr	\$32, $t0
+	add	$acc3, $acc0
+	adc	$t0, $acc1
+	 mov	$acc0, $t0
+	adc	%rax, $acc2
+	 mov	$acc1, $in_ptr
 	adc	\$0, %rdx
-	add	%rax, $acc1
-	adc	%rdx, $acc2
-	sbb	$acc3, $acc3
 
-	mov	0*8($in_ptr), %rax
-	mov	1*8($in_ptr), %rdx
-	mov	2*8($in_ptr), $t0
-	mov	3*8($in_ptr), $t1
-
-	and	$acc3, %rax
-	and	$acc3, %rdx
-	and	$acc3, $t0
-	and	$acc3, $t1
-
-	sub	%rax, $acc4
-	sbb	%rdx, $acc0
-	mov	$acc4, 8*0($r_ptr)
-	sbb	$t0, $acc1
-	mov	$acc0, 8*1($r_ptr)
-	sbb	$t1, $acc2
-	mov	$acc1, 8*2($r_ptr)
-	mov	$acc2, 8*3($r_ptr)
+	###########################################
+	# Branch-less conditional subtraction
+	sub	\$-1, $acc0
+	 mov	$acc2, %rax
+	sbb	$t1, $acc1
+	sbb	\$0, $acc2
+	 mov	%rdx, $acc3
+	sbb	$t2, %rdx
+	sbb	$t2, $t2
+
+	cmovnz	$t0, $acc0
+	cmovnz	$in_ptr, $acc1
+	mov	$acc0, 8*0($r_ptr)
+	cmovnz	%rax, $acc2
+	mov	$acc1, 8*1($r_ptr)
+	cmovz	%rdx, $acc3
+	mov	$acc2, 8*2($r_ptr)
+	mov	$acc3, 8*3($r_ptr)
 
 	pop	%r13
 	pop	%r12
