diff --git a/crypto/ec/asm/ecp_nistz256-x86_64.pl b/crypto/ec/asm/ecp_nistz256-x86_64.pl
index 86ee382..30de8b7 100755
--- a/crypto/ec/asm/ecp_nistz256-x86_64.pl
+++ b/crypto/ec/asm/ecp_nistz256-x86_64.pl
@@ -2,7 +2,13 @@
 
 ##############################################################################
 #                                                                            #
-# Copyright 2014 Intel Corporation                                           #
+# Copyright (c) 2015 Intel Corporation                                       #
+# Copyright (c) 2015 CloudFlare, Inc.                                        #
+# All rights reserved.                                                       #
+#                                                                            #
+# This software is dual licensed under the Apache V.2.0 and BSD licenses     #
+#                                                                            #
+##############################################################################
 #                                                                            #
 # Licensed under the Apache License, Version 2.0 (the "License");            #
 # you may not use this file except in compliance with the License.           #
@@ -18,10 +24,41 @@
 #                                                                            #
 ##############################################################################
 #                                                                            #
+#  Redistribution and use in source and binary forms, with or without        #
+#  modification, are permitted provided that the following conditions are    #
+#  met:                                                                      #
+#                                                                            #
+#  #  Redistributions of source code must retain the above copyright         #
+#     notice, this list of conditions and the following disclaimer.          #
+#                                                                            #
+#  #  Redistributions in binary form must reproduce the above copyright      #
+#     notice, this list of conditions and the following disclaimer in the    #
+#     documentation and/or other materials provided with the                 #
+#     distribution.                                                          #
+#                                                                            #
+#  #  Neither the name of the copyright holders nor the names of its         #
+#     contributors may be used to endorse or promote products derived from   #
+#     this software without specific prior written permission.               #
+#                                                                            #
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS       #
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED #
+#  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR#
+#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR         #
+#  CONTRIBUTORS  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,    #
+#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
+#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
+#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
+#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
+#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
+#                                                                            #
+##############################################################################
+#                                                                            #
 #  Developers and authors:                                                   #
-#  Shay Gueron (1, 2), and Vlad Krasnov (1)                                  #
+#  Shay Gueron (1, 2), and Vlad Krasnov (1, 3)                               #
 #  (1) Intel Corporation, Israel Development Center                          #
 #  (2) University of Haifa                                                   #
+#  (3) CloudFlare, Inc.                                                      #
 #  Reference:                                                                #
 #  S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with#
 #                           256 Bit Primes"                                  #
@@ -108,6 +145,13 @@ $code.=<<___;
 .long 3,3,3,3,3,3,3,3
 .LONE_mont:
 .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+
+# Constants for computations modulo ord(p256)
+.align 64
+.Lord:
+.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000
+.LordK:
+.quad 0xccd1c8aaee00bc4f
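+# .Lord is ord(p256) in little-endian limb order; .LordK is the Montgomery
+# reduction constant -1/ord(p256) mod 2^64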
 ___
 
 {
@@ -435,6 +479,981 @@ my ($poly1,$poly3)=($acc6,$acc7);
 
 $code.=<<___;
 ################################################################################
+# void ecp_nistz256_ord_mul_mont(
+#   uint64_t res[4],
+#   uint64_t a[4],
+#   uint64_t b[4]);
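+#
+# Montgomery multiplication modulo the P-256 group order: the multiplication
+# by each limb of b is interleaved with one reduction step that folds the
+# lowest accumulator limb using .LordK.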
+
+.globl	ecp_nistz256_ord_mul_mont
+.type	ecp_nistz256_ord_mul_mont,\@function,3
+.align	32
+ecp_nistz256_ord_mul_mont:
+___
+$code.=<<___	if ($addx);
+	mov	\$0x80100, %ecx
+	and	OPENSSL_ia32cap_P+8(%rip), %ecx
+	cmp	\$0x80100, %ecx
+	je	ecp_nistz256_ord_mul_montx
+___
+$code.=<<___;
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+
+	mov	$b_org, $b_ptr
+	# * b[0]
+	mov	8*0($b_ptr), $t0
+	mov	8*0($a_ptr), $t4
+	mul	$t0
+	mov	$t4, $acc0
+	mov	$t3, $acc1
+
+	mov	8*1($a_ptr), $t4
+	mul	$t0
+	add	$t4, $acc1
+	adc	\$0, $t3
+	mov	$t3, $acc2
+
+	mov	8*2($a_ptr), $t4
+	mul	$t0
+	add	$t4, $acc2
+	adc	\$0, $t3
+	mov	$t3, $acc3
+
+	mov	8*3($a_ptr), $t4
+	mul	$t0
+	add	$t4, $acc3
+	adc	\$0, $t3
+	mov	$t3, $acc4
+	xor	$acc5, $acc5
+
+	# First reduction step
+	mov	$acc0, $t4
+	mulq	.LordK(%rip)
+	mov	$t4, $t0
+
+	mov	8*0+.Lord(%rip), $t4
+	mul	$t0
+	add	$t4, $acc0
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*1+.Lord(%rip), $t4
+	mul	$t0
+	add	$t1, $acc1
+	adc	\$0, $t3
+	add	$t4, $acc1
+
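+	# .Lord[2] is all-ones, so the multiplication by it is replaced by an
+	# add/sub pair: x*(2^64-1) = (x<<64) - x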
+	mov	$t0, $t1
+	adc	$t3, $acc2
+	adc	\$0, $t1
+	sub	$t0, $acc2
+	sbb	\$0, $t1
+
+	mov	8*3+.Lord(%rip), $t4
+	mul	$t0
+	add	$t1, $acc3
+	adc	\$0, $t3
+	add	$t4, $acc3
+	adc	$t3, $acc4
+	adc	\$0, $acc5
+
+	# * b[1]
+	mov	8*1($b_ptr), $t0
+
+	mov	8*0($a_ptr), $t4
+	mul	$t0
+	add	$t4, $acc1
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*1($a_ptr), $t4
+	mul	$t0
+	add	$t1, $acc2
+	adc	\$0, $t3
+	add	$t4, $acc2
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*2($a_ptr), $t4
+	mul	$t0
+	add	$t1, $acc3
+	adc	\$0, $t3
+	add	$t4, $acc3
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*3($a_ptr), $t4
+	mul	$t0
+	add	$t1, $acc4
+	adc	\$0, $t3
+	add	$t4, $acc4
+	adc	$t3, $acc5
+	adc	\$0, $acc0
+	# Second reduction step
+	mov	$acc1, $t4
+	mulq	.LordK(%rip)
+	mov	$t4, $t0
+
+	mov	8*0+.Lord(%rip), $t4
+	mul	$t0
+	add	$t4, $acc1
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*1+.Lord(%rip), $t4
+	mul	$t0
+	add	$t1, $acc2
+	adc	\$0, $t3
+	add	$t4, $acc2
+
+	mov	$t0, $t1
+	adc	$t3, $acc3
+	adc	\$0, $t1
+	sub	$t0, $acc3
+	sbb	\$0, $t1
+
+	mov	8*3+.Lord(%rip), $t4
+	mul	$t0
+	add	$t1, $acc4
+	adc	\$0, $t3
+	add	$t4, $acc4
+	adc	$t3, $acc5
+	adc	\$0, $acc0
+	# * b[2]
+	mov	8*2($b_ptr), $t0
+
+	mov	8*0($a_ptr), $t4
+	mul	$t0
+	add	$t4, $acc2
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*1($a_ptr), $t4
+	mul	$t0
+	add	$t1, $acc3
+	adc	\$0, $t3
+	add	$t4, $acc3
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*2($a_ptr), $t4
+	mul	$t0
+	add	$t1, $acc4
+	adc	\$0, $t3
+	add	$t4, $acc4
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*3($a_ptr), $t4
+	mul	$t0
+	add	$t1, $acc5
+	adc	\$0, $t3
+	add	$t4, $acc5
+	adc	$t3, $acc0
+	adc	\$0, $acc1
+	# Third reduction step
+	mov	$acc2, $t4
+	mulq	.LordK(%rip)
+	mov	$t4, $t0
+
+	mov	8*0+.Lord(%rip), $t4
+	mul	$t0
+	add	$t4, $acc2
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*1+.Lord(%rip), $t4
+	mul	$t0
+	add	$t1, $acc3
+	adc	\$0, $t3
+	add	$t4, $acc3
+
+	mov	$t0, $t1
+	adc	$t3, $acc4
+	adc	\$0, $t1
+	sub	$t0, $acc4
+	sbb	\$0, $t1
+
+	mov	8*3+.Lord(%rip), $t4
+	mul	$t0
+	add	$t1, $acc5
+	adc	\$0, $t3
+	add	$t4, $acc5
+	adc	$t3, $acc0
+	adc	\$0, $acc1
+	# * b[3]
+	mov	8*3($b_ptr), $t0
+
+	mov	8*0($a_ptr), $t4
+	mul	$t0
+	add	$t4, $acc3
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*1($a_ptr), $t4
+	mul	$t0
+	add	$t1, $acc4
+	adc	\$0, $t3
+	add	$t4, $acc4
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*2($a_ptr), $t4
+	mul	$t0
+	add	$t1, $acc5
+	adc	\$0, $t3
+	add	$t4, $acc5
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*3($a_ptr), $t4
+	mul	$t0
+	add	$t1, $acc0
+	adc	\$0, $t3
+	add	$t4, $acc0
+	adc	$t3, $acc1
+	adc	\$0, $acc2
+	# Last reduction step
+	mov	$acc3, $t4
+	mulq	.LordK(%rip)
+	mov	$t4, $t0
+
+	mov	8*0+.Lord(%rip), $t4
+	mul	$t0
+	add	$t4, $acc3
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*1+.Lord(%rip), $t4
+	mul	$t0
+	add	$t1, $acc4
+	adc	\$0, $t3
+	add	$t4, $acc4
+
+	mov	$t0, $t1
+	adc	$t3, $acc5
+	adc	\$0, $t1
+	sub	$t0, $acc5
+	sbb	\$0, $t1
+
+	mov	8*3+.Lord(%rip), $t4
+	mul	$t0
+	add	$t1, $acc0
+	adc	\$0, $t3
+	add	$t4, $acc0
+	adc	$t3, $acc1
+	adc	\$0, $acc2
+
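+	# Reduce the result into [0, ord): subtract ord and, if the subtraction
+	# borrows, restore the saved copy via cmovc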
+	# Copy result [255:0]
+	mov	$acc4, $a_ptr
+	mov	$acc5, $acc3
+	mov	$acc0, $t0
+	mov	$acc1, $t1
+	# Subtract ord
+	sub	8*0+.Lord(%rip), $acc4
+	sbb	8*1+.Lord(%rip), $acc5
+	sbb	8*2+.Lord(%rip), $acc0
+	sbb	8*3+.Lord(%rip), $acc1
+	sbb	\$0, $acc2
+
+	cmovc	$a_ptr, $acc4
+	cmovc	$acc3, $acc5
+	cmovc	$t0, $acc0
+	cmovc	$t1, $acc1
+
+	mov	$acc4, 8*0($r_ptr)
+	mov	$acc5, 8*1($r_ptr)
+	mov	$acc0, 8*2($r_ptr)
+	mov	$acc1, 8*3($r_ptr)
+
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+	pop	%rbp
+	ret
+.size	ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont
+___
+$code.=<<___	if ($addx);
+################################################################################
+.align	32
+ecp_nistz256_ord_mul_montx:
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	$b_org, $b_ptr
+	mov	8*0($b_org), %rdx
+	mov	8*0($a_ptr), $acc1
+	mov	8*1($a_ptr), $acc2
+	mov	8*2($a_ptr), $acc3
+	mov	8*3($a_ptr), $acc4
+	lea	-128($a_ptr), $a_ptr	# control u-op density
+
+	# Multiply by b[0]
+	mulx	$acc1, $acc0, $acc1
+	mulx	$acc2, $t0, $acc2
+	xor	$acc5, $acc5		# cf=0
+	mulx	$acc3, $t1, $acc3
+	adc	$t0, $acc1
+	mulx	$acc4, $t0, $acc4
+	 mov	$acc0, %rdx
+	mulx	.LordK(%rip), %rdx, $t4
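+	# %rdx now holds the reduction multiplier acc0*.LordK mod 2^64 (the
+	# high half of that product is unused)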
+	adc	$t1, $acc2
+	adc	$t0, $acc3
+	adc	\$0, $acc4
+
+	########################################################################
+	xor	%eax, %eax
+	mulx	8*0+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc0
+	adox	$t1, $acc1
+	mulx	8*1+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc1
+	adox	$t1, $acc2
+	mulx	8*2+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+	mulx	8*3+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+	mov	8*1($b_ptr), %rdx
+	adcx	%rax, $acc4
+	adox	%rax, $acc5
+	adc	\$0, $acc5
+	xor	$acc0, $acc0
+	########################################################################
+	# Multiply by b[1]
+	mulx	8*0+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc1
+	adox	$t1, $acc2
+
+	mulx	8*1+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+
+	mulx	8*2+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	8*3+128($a_ptr), $t0, $t1
+	 mov	$acc1, %rdx
+	mulx	.LordK(%rip), %rdx, $t4
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+
+	adcx	$acc0, $acc5
+	adox	$acc0, $acc0
+	adc	\$0, $acc0
+	########################################################################
+	xor	%eax, %eax
+	mulx	8*0+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc1
+	adox	$t1, $acc2
+	mulx	8*1+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+	mulx	8*2+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+	mulx	8*3+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+	mov	8*2($b_ptr), %rdx
+	adcx	%rax, $acc5
+	adox	%rax, $acc0
+	adc	\$0, $acc0
+	xor	$acc1, $acc1		# $acc1=0,cf=0,of=0
+	########################################################################
+	# Multiply by b[2]
+	mulx	8*0+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+
+	mulx	8*1+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	8*2+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+
+	mulx	8*3+128($a_ptr), $t0, $t1
+	 mov	$acc2, %rdx
+	mulx	.LordK(%rip), %rdx, $t4
+	adcx	$t0, $acc5
+	adox	$t1, $acc0
+
+	adcx	$acc1, $acc0
+	adox	$acc1, $acc1
+	adc	\$0, $acc1
+
+	########################################################################
+	xor	%eax, %eax
+	mulx	8*0+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+	mulx	8*1+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+	mulx	8*2+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+	mulx	8*3+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc5
+	adox	$t1, $acc0
+	mov	8*3($b_ptr), %rdx
+	adcx	%rax, $acc0
+	adox	%rax, $acc1
+	adc	\$0, $acc1
+	xor	$acc2, $acc2		# $acc2=0,cf=0,of=0
+	########################################################################
+	# Multiply by b[3]
+	mulx	8*0+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	8*1+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+
+	mulx	8*2+128($a_ptr), $t0, $t1
+	adcx	$t0, $acc5
+	adox	$t1, $acc0
+
+	mulx	8*3+128($a_ptr), $t0, $t1
+	 mov	$acc3, %rdx
+	mulx	.LordK(%rip), %rdx, $t4
+	adcx	$t0, $acc0
+	adox	$t1, $acc1
+
+	adcx	$acc2, $acc1
+	adox	$acc2, $acc2
+	adc	\$0, $acc2
+
+	########################################################################
+	xor	%eax, %eax
+	mulx	8*0+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+	mulx	8*1+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+	mulx	8*2+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc5
+	adox	$t1, $acc0
+	mulx	8*3+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc0
+	adox	$t1, $acc1
+	adcx	%rax, $acc1
+	adox	%rax, $acc2
+	adc	\$0, $acc2
+
+	########################################################################
+	# Branch-less conditional subtraction of the group order
+	xor	%eax, %eax
+	 mov	$acc4, $t2
+	 mov	$acc5, $t3
+	 mov	$acc0, $t0
+	 mov	$acc1, $t1
+	sbb	8*0+.Lord(%rip), $acc4		# .Lord[0]
+	sbb	8*1+.Lord(%rip), $acc5		# .Lord[1]
+	sbb	8*2+.Lord(%rip), $acc0		# .Lord[2]
+	sbb	8*3+.Lord(%rip), $acc1		# .Lord[3]
+	sbb	\$0, $acc2
+
+	cmovc	$t2, $acc4
+	cmovc	$t3, $acc5
+	mov	$acc4, 8*0($r_ptr)
+	cmovc	$t0, $acc0
+	mov	$acc5, 8*1($r_ptr)
+	cmovc	$t1, $acc1
+	mov	$acc0, 8*2($r_ptr)
+	mov	$acc1, 8*3($r_ptr)
+
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+	pop	%rbp
+	ret
+.size	ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx
+################################################################################
+___
+$code.=<<___;
+# void ecp_nistz256_ord_sqr_mont(
+#   uint64_t res[4],
+#   uint64_t a[4],
+#   int rep);
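+#
+# Performs rep sequential Montgomery squarings modulo the P-256 group order;
+# inputs and outputs remain in the Montgomery domain.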
+
+.globl	ecp_nistz256_ord_sqr_mont
+.type	ecp_nistz256_ord_sqr_mont,\@function,3
+.align	32
+ecp_nistz256_ord_sqr_mont:
+
+___
+$code.=<<___	if ($addx);
+	mov	\$0x80100, %ecx
+	and	OPENSSL_ia32cap_P+8(%rip), %ecx
+	cmp	\$0x80100, %ecx
+	je	ecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	.LordK(%rip), %r15
+
+	mov	$b_org, %r14
+
+1:
+	# y[1:] * y[0]
+	mov	8*0($a_ptr), $t0
+
+	mov	8*1($a_ptr), $t4
+	mul	$t0
+	mov	$t4, $acc1
+	mov	$t3, $acc2
+
+	mov	8*2($a_ptr), $t4
+	mul	$t0
+	add	$t4, $acc2
+	adc	\$0, $t3
+	mov	$t3, $acc3
+
+	mov	8*3($a_ptr), $t4
+	mul	$t0
+	add	$t4, $acc3
+	adc	\$0, $t3
+	mov	$t3, $acc4
+	# y[2:] * y[1]
+	mov	8*1($a_ptr), $t0
+
+	mov	8*2($a_ptr), $t4
+	mul	$t0
+	add	$t4, $acc3
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*3($a_ptr), $t4
+	mul	$t0
+	add	$t1, $acc4
+	adc	\$0, $t3
+	add	$t4, $acc4
+	adc	\$0, $t3
+	mov	$t3, $acc5
+	# y[3] * y[2]
+	mov	8*2($a_ptr), $t0
+
+	mov	8*3($a_ptr), $t4
+	mul	$t0
+	add	$t4, $acc5
+	adc	\$0, $t3
+	mov	$t3, $b_ptr
+	xor	$t1, $t1
+	# *2
+	add	$acc1, $acc1
+	adc	$acc2, $acc2
+	adc	$acc3, $acc3
+	adc	$acc4, $acc4
+	adc	$acc5, $acc5
+	adc	$b_ptr, $b_ptr
+	adc	\$0, $t1
+	# Missing products
+	mov	8*0($a_ptr), $t4
+	mul	$t4
+	mov	$t4, $acc0
+	mov	$t3, $t0
+
+	mov	8*1($a_ptr), $t4
+	mul	$t4
+	add	$t0, $acc1
+	adc	$t4, $acc2
+	adc	\$0, $t3
+	mov	$t3, $t0
+
+	mov	8*2($a_ptr), $t4
+	mul	$t4
+	add	$t0, $acc3
+	adc	$t4, $acc4
+	adc	\$0, $t3
+	mov	$t3, $t0
+
+	mov	8*3($a_ptr), $t4
+	mul	$t4
+	add	$t0, $acc5
+	adc	$t4, $b_ptr
+	adc	$t3, $t1
+	mov	$t1, $a_ptr
+
+	# First reduction step
+	mov	$acc0, $t4
+	mulq	%r15
+	mov	$t4, $t0
+
+	mov	8*0+.Lord(%rip), $t4
+	mul	$t0
+	add	$t4, $acc0
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*1+.Lord(%rip), $t4
+	mul	$t0
+	add	$t1, $acc1
+	adc	\$0, $t3
+	add	$t4, $acc1
+
+	mov	$t0, $t1
+	adc	$t3, $acc2
+	adc	\$0, $t1
+	sub	$t0, $acc2
+	sbb	\$0, $t1
+
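+	# .Lord[3] = 2^64 - 2^32, so the multiplication by it is done with
+	# shifts instead of a mul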
+	mov	$t0, $t4
+	mov	$t0, $t3
+	mov	$t0, $acc0
+	shl	\$32, $t4
+	shr	\$32, $t3
+
+	add	$t1, $acc3
+	adc	\$0, $acc0
+	sub	$t4, $acc3
+	sbb	$t3, $acc0
+
+	# Second reduction step
+	mov	$acc1, $t4
+	mulq	%r15
+	mov	$t4, $t0
+
+	mov	8*0+.Lord(%rip), $t4
+	mul	$t0
+	add	$t4, $acc1
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*1+.Lord(%rip), $t4
+	mul	$t0
+	add	$t1, $acc2
+	adc	\$0, $t3
+	add	$t4, $acc2
+
+	mov	$t0, $t1
+	adc	$t3, $acc3
+	adc	\$0, $t1
+	sub	$t0, $acc3
+	sbb	\$0, $t1
+
+	mov	$t0, $t4
+	mov	$t0, $t3
+	mov	$t0, $acc1
+	shl	\$32, $t4
+	shr	\$32, $t3
+
+	add	$t1, $acc0
+	adc	\$0, $acc1
+	sub	$t4, $acc0
+	sbb	$t3, $acc1
+
+	# Third reduction step
+	mov	$acc2, $t4
+	mulq	%r15
+	mov	$t4, $t0
+
+	mov	8*0+.Lord(%rip), $t4
+	mul	$t0
+	add	$t4, $acc2
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*1+.Lord(%rip), $t4
+	mul	$t0
+	add	$t1, $acc3
+	adc	\$0, $t3
+	add	$t4, $acc3
+
+	mov	$t0, $t1
+	adc	$t3, $acc0
+	adc	\$0, $t1
+	sub	$t0, $acc0
+	sbb	\$0, $t1
+
+	mov	$t0, $t4
+	mov	$t0, $t3
+	mov	$t0, $acc2
+	shl	\$32, $t4
+	shr	\$32, $t3
+
+	add	$t1, $acc1
+	adc	\$0, $acc2
+	sub	$t4, $acc1
+	sbb	$t3, $acc2
+
+	# Last reduction step
+	mov	$acc3, $t4
+	mulq	%r15
+	mov	$t4, $t0
+
+	mov	8*0+.Lord(%rip), $t4
+	mul	$t0
+	add	$t4, $acc3
+	adc	\$0, $t3
+	mov	$t3, $t1
+
+	mov	8*1+.Lord(%rip), $t4
+	mul	$t0
+	add	$t1, $acc0
+	adc	\$0, $t3
+	add	$t4, $acc0
+
+	mov	$t0, $t1
+	adc	$t3, $acc1
+	adc	\$0, $t1
+	sub	$t0, $acc1
+	sbb	\$0, $t1
+
+	mov	$t0, $t4
+	mov	$t0, $acc3
+	shl	\$32, $t4
+	shr	\$32, $t0
+
+	add	$t1, $acc2
+	adc	\$0, $acc3
+	sub	$t4, $acc2
+	sbb	$t0, $acc3
+	xor	$t0, $t0
+	# Add bits [511:256] of the sqr result
+	add	$acc4, $acc0
+	adc	$acc5, $acc1
+	adc	$b_ptr, $acc2
+	adc	$a_ptr, $acc3
+	adc	\$0, $t0
+
+	mov	$acc0, $acc4
+	mov	$acc1, $acc5
+	mov	$acc2, $b_ptr
+	mov	$acc3, $t1
+	# Subtract the group order
+	sub	8*0+.Lord(%rip), $acc0
+	sbb	8*1+.Lord(%rip), $acc1
+	sbb	8*2+.Lord(%rip), $acc2
+	sbb	8*3+.Lord(%rip), $acc3
+	sbb	\$0, $t0
+
+	cmovc	$acc4, $acc0
+	cmovc	$acc5, $acc1
+	cmovc	$b_ptr, $acc2
+	cmovc	$t1, $acc3
+
+	mov	$acc0, 8*0($r_ptr)
+	mov	$acc1, 8*1($r_ptr)
+	mov	$acc2, 8*2($r_ptr)
+	mov	$acc3, 8*3($r_ptr)
+	mov	$r_ptr, $a_ptr
+	dec	%r14
+	jne	1b
+
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+	pop	%rbp
+	ret
+.size	ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont
+___
+$code.=<<___	if ($addx);
+.align	32
+ecp_nistz256_ord_sqr_montx:
+
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	$b_org, $t2
+	lea	-128($a_ptr), $a_ptr	# control u-op density
+1:
+	mov	8*0+128($a_ptr), %rdx
+	mov	8*1+128($a_ptr), $acc6
+	mov	8*2+128($a_ptr), $acc7
+	mov	8*3+128($a_ptr), $acc0
+
+	mulx	$acc6, $acc1, $acc2	# a[0]*a[1]
+	mulx	$acc7, $t0, $acc3	# a[0]*a[2]
+	xor	%eax, %eax
+	adc	$t0, $acc2
+	mulx	$acc0, $t1, $acc4	# a[0]*a[3]
+	 mov	$acc6, %rdx
+	adc	$t1, $acc3
+	adc	\$0, $acc4
+	xor	$acc5, $acc5		# $acc5=0,cf=0,of=0
+	#################################
+	mulx	$acc7, $t0, $t1		# a[1]*a[2]
+	adcx	$t0, $acc3
+	adox	$t1, $acc4
+
+	mulx	$acc0, $t0, $t1		# a[1]*a[3]
+	 mov	$acc7, %rdx
+	adcx	$t0, $acc4
+	adox	$t1, $acc5
+	adc	\$0, $acc5
+	#################################
+	mulx	$acc0, $t0, $acc6	# a[2]*a[3]
+	 mov	8*0+128($a_ptr), %rdx
+	xor	$acc7, $acc7		# $acc7=0,cf=0,of=0
+	 adcx	$acc1, $acc1		# acc1:6<<1
+	adox	$t0, $acc5
+	 adcx	$acc2, $acc2
+	adox	$acc7, $acc6		# of=0
+
+	mulx	%rdx, $acc0, $t1
+	mov	8*1+128($a_ptr), %rdx
+	 adcx	$acc3, $acc3
+	adox	$t1, $acc1
+	 adcx	$acc4, $acc4
+	mulx	%rdx, $t0, $t4
+	mov	8*2+128($a_ptr), %rdx
+	 adcx	$acc5, $acc5
+	adox	$t0, $acc2
+	 adcx	$acc6, $acc6
+	.byte	0x67
+	mulx	%rdx, $t0, $t1
+	mov	8*3+128($a_ptr), %rdx
+	adox	$t4, $acc3
+	 adcx	$acc7, $acc7
+	adox	$t0, $acc4
+	adox	$t1, $acc5
+	.byte	0x67,0x67
+	mulx	%rdx, $t0, $t4
+	adox	$t0, $acc6
+	adox	$t4, $acc7
+
+	#reduce
+	mov	$acc0, %rdx
+	mulx	.LordK(%rip), %rdx, $t0
+
+	xor	%eax, %eax
+	mulx	8*0+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc0
+	adox	$t1, $acc1
+	mulx	8*1+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc1
+	adox	$t1, $acc2
+	mulx	8*2+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+	mulx	8*3+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc0
+	adcx	%rax, $acc0
+	#################################
+	mov	$acc1, %rdx
+	mulx	.LordK(%rip), %rdx, $t0
+
+	mulx	8*0+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc1
+	adox	$t1, $acc2
+	mulx	8*1+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+	mulx	8*2+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc0
+	mulx	8*3+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc0
+	adox	$t1, $acc1
+	adcx	%rax, $acc1
+	#################################
+	mov	$acc2, %rdx
+	mulx	.LordK(%rip), %rdx, $t0
+
+	mulx	8*0+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+	mulx	8*1+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc0
+	mulx	8*2+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc0
+	adox	$t1, $acc1
+	mulx	8*3+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc1
+	adox	$t1, $acc2
+	adcx	%rax, $acc2
+	#################################
+	mov	$acc3, %rdx
+	mulx	.LordK(%rip), %rdx, $t0
+
+	mulx	8*0+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc3
+	adox	$t1, $acc0
+	mulx	8*1+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc0
+	adox	$t1, $acc1
+	mulx	8*2+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc1
+	adox	$t1, $acc2
+	mulx	8*3+.Lord(%rip), $t0, $t1
+	adcx	$t0, $acc2
+	adox	$t1, $acc3
+	adcx	%rax, $acc3
+
+	xor	$t0, $t0
+	add	$acc4, $acc0
+	adc	$acc5, $acc1
+	adc	$acc6, $acc2
+	adc	$acc7, $acc3
+	adc	\$0, $t0
+
+	mov	$acc0, $acc4
+	mov	$acc1, $acc5
+	mov	$acc2, $acc6
+	mov	$acc3, $acc7
+	# Subtract the group order
+	sub	8*0+.Lord(%rip), $acc0
+	sbb	8*1+.Lord(%rip), $acc1
+	sbb	8*2+.Lord(%rip), $acc2
+	sbb	8*3+.Lord(%rip), $acc3
+	sbb	\$0, $t0
+
+	cmovc	$acc4, $acc0
+	cmovc	$acc5, $acc1
+	cmovc	$acc6, $acc2
+	cmovc	$acc7, $acc3
+
+	mov	$acc0, 8*0($r_ptr)
+	mov	$acc1, 8*1($r_ptr)
+	mov	$acc2, 8*2($r_ptr)
+	mov	$acc3, 8*3($r_ptr)
+
+	lea	-128($r_ptr), $a_ptr
+
+	dec	$t2
+	jne	1b
+
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+	pop	%rbp
+	ret
+
+.size	ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx
+___
+$code.=<<___;
+################################################################################
 # void ecp_nistz256_to_mont(
 #   uint64_t res[4],
 #   uint64_t in[4]);
diff --git a/crypto/ec/ec_lcl.h b/crypto/ec/ec_lcl.h
index 9db7106..a3d4ba6 100644
--- a/crypto/ec/ec_lcl.h
+++ b/crypto/ec/ec_lcl.h
@@ -196,6 +196,9 @@ struct ec_method_st {
     int (*field_decode) (const EC_GROUP *, BIGNUM *r, const BIGNUM *a,
                          BN_CTX *);
     int (*field_set_to_one) (const EC_GROUP *, BIGNUM *r, BN_CTX *);
+
+    /* Inverse modulo order */
+    int (*field_inverse_mod_ord) (const EC_GROUP *, BIGNUM *r, BIGNUM *x,
+                                  BN_CTX *ctx);
 } /* EC_METHOD */ ;
 
 typedef struct ec_extra_data_st {
diff --git a/crypto/ec/ec_lib.c b/crypto/ec/ec_lib.c
index 0e850d6..d0e7338 100644
--- a/crypto/ec/ec_lib.c
+++ b/crypto/ec/ec_lib.c
@@ -351,6 +351,13 @@ int EC_GROUP_get_order(const EC_GROUP *group, BIGNUM *order, BN_CTX *ctx)
     return !BN_is_zero(order);
 }
 
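+/*
+ * Compute res = x^-1 modulo the group order using the method's optimized
+ * routine, if one is provided; returns 0 when no such routine is available,
+ * so callers can fall back to a generic modular inverse.
+ */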
+int EC_GROUP_do_inverse_ord(const EC_GROUP *group, BIGNUM *res, BIGNUM *x,
+                            BN_CTX *ctx)
+{
+    if (group->meth->field_inverse_mod_ord != 0)
+        return group->meth->field_inverse_mod_ord(group, res, x, ctx);
+    else
+        return 0;
+}
+
 int EC_GROUP_get_cofactor(const EC_GROUP *group, BIGNUM *cofactor,
                           BN_CTX *ctx)
 {
diff --git a/crypto/ec/ecp_nistz256.c b/crypto/ec/ecp_nistz256.c
index de9fbea..7b3dde8 100644
--- a/crypto/ec/ecp_nistz256.c
+++ b/crypto/ec/ecp_nistz256.c
@@ -1,6 +1,12 @@
 /******************************************************************************
  *                                                                            *
- * Copyright 2014 Intel Corporation                                           *
+ * Copyright (c) 2015 Intel Corporation                                       *
+ * Copyright (c) 2015 CloudFlare, Inc.                                        *
+ * All rights reserved.                                                       *
+ *                                                                            *
+ * This software is dual licensed under the Apache V.2.0 and BSD licenses     *
+ *                                                                            *
+ ******************************************************************************
  *                                                                            *
  * Licensed under the Apache License, Version 2.0 (the "License");            *
  * you may not use this file except in compliance with the License.           *
@@ -16,10 +22,41 @@
  *                                                                            *
  ******************************************************************************
  *                                                                            *
+ *  Redistribution and use in source and binary forms, with or without        *
+ *  modification, are permitted provided that the following conditions are    *
+ *  met:                                                                      *
+ *                                                                            *
+ *  1. Redistributions of source code must retain the above copyright         *
+ *     notice, this list of conditions and the following disclaimer.          *
+ *                                                                            *
+ *  2. Redistributions in binary form must reproduce the above copyright      *
+ *     notice, this list of conditions and the following disclaimer in the    *
+ *     documentation and/or other materials provided with the                 *
+ *     distribution.                                                          *
+ *                                                                            *
+ *  3. Neither the name of the copyright holders nor the names of its         *
+ *     contributors may be used to endorse or promote products derived from   *
+ *     this software without specific prior written permission.               *
+ *                                                                            *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS       *
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *
+ *  TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR*
+ *  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR         *
+ *  CONTRIBUTORS  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,    *
+ *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       *
+ *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        *
+ *  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    *
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      *
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        *
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              *
+ *                                                                            *
+ ******************************************************************************
+ *                                                                            *
  * Developers and authors:                                                    *
- * Shay Gueron (1, 2), and Vlad Krasnov (1)                                   *
+ * Shay Gueron (1, 2), and Vlad Krasnov (1, 3)                                *
  * (1) Intel Corporation, Israel Development Center                           *
  * (2) University of Haifa                                                    *
+ * (3) CloudFlare, Inc.                                                       *
  * Reference:                                                                 *
  * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with *
  *                          256 Bit Primes"                                   *
@@ -102,6 +139,13 @@ void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]);
 void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS],
                            const BN_ULONG a[P256_LIMBS],
                            const BN_ULONG b[P256_LIMBS]);
+/* Montgomery mul modulo the group order: res = a*b*2^-256 mod ord(P256) */
+void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS],
+                           const BN_ULONG a[P256_LIMBS],
+                           const BN_ULONG b[P256_LIMBS]);
+void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS],
+                           const BN_ULONG a[P256_LIMBS],
+                           int rep);
 /* Montgomery sqr: res = a*a*2^-256 mod P */
 void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS],
                            const BN_ULONG a[P256_LIMBS]);
@@ -1341,6 +1385,72 @@ static int ecp_nistz256_points_mul(const EC_GROUP *group,
     return ret;
 }
 
+#ifdef __x86_64
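+// Computes r = x^-1 mod ord(p256) as x^(ord(p256)-2) (Fermat's little
+// theorem), using the ord_mul_mont/ord_sqr_mont primitives above with a
+// fixed 4-bit window for the low half of the exponent.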
+int ecp_nistz256_inv_mod_ord(const EC_GROUP *group, BIGNUM *r, BIGNUM *x, BN_CTX *ctx)
+{
+    BN_ULONG table[P256_LIMBS*15];
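+    // RR = 2^512 mod ord(p256), used to bring x into the Montgomery domain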
+    const BN_ULONG RR[P256_LIMBS] = {TOBN(0x83244c95,0xbe79eea2),
+                                     TOBN(0x4699799c,0x49bd6fa6),
+                                     TOBN(0x2845b239,0x2b6bec59),
+                                     TOBN(0x66e12d94,0xf3d95620)};
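+    // ONE is the plain value 1 (not in Montgomery form)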
+    const BN_ULONG ONE[P256_LIMBS] = {TOBN(0,1),TOBN(0,0),TOBN(0,0),TOBN(0,0)};
+    BN_ULONG out[P256_LIMBS], t[P256_LIMBS];
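+    // The low 128 bits of ord(p256)-2, one hex digit per entry, most
+    // significant digit first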
+    const unsigned char expLo[32] = {
+        0xb, 0xc, 0xe, 0x6, 0xf, 0xa, 0xa, 0xd,
+        0xa, 0x7, 0x1, 0x7, 0x9, 0xe, 0x8, 0x4,
+        0xf, 0x3, 0xb, 0x9, 0xc, 0xa, 0xc, 0x2,
+        0xf, 0xc, 0x6, 0x3, 0x2, 0x5, 0x4, 0xf
+    };
+    int i, ret = 0;
+    BIGNUM *tmp;
+    // We don't use entry 0 in the table, so we omit it and address with -1 offset
+    if (BN_num_bits(x) > 256 || BN_is_negative(x)) {
+        if ((tmp = BN_CTX_get(ctx)) == NULL)
+            goto err;
+
+        if (!BN_nnmod(tmp, x, group->order, ctx)) {
+            goto err;
+        }
+        x = tmp;
+    }
+    // The exponent is public, so there is no need for constant-time table access
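+    // table[i-1] holds x^i in the Montgomery domain, for i = 1..15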
+    ecp_nistz256_ord_mul_mont(&table[0*P256_LIMBS], bn_get_words(x), RR);
+    for (i = 2; i < 16; i += 2) {
+        ecp_nistz256_ord_sqr_mont(&table[(i - 1) * P256_LIMBS],
+                                  &table[(i / 2 - 1) * P256_LIMBS], 1);
+        ecp_nistz256_ord_mul_mont(&table[i * P256_LIMBS],
+                                  &table[(i - 1) * P256_LIMBS],
+                                  &table[0 * P256_LIMBS]);
+    }
+    // The top 128 bits of the exponent are highly redundant, so we perform
+    // an optimized flow
+    memcpy(out, &table[(15-1)*P256_LIMBS], sizeof(out));             //f
+
+    ecp_nistz256_ord_sqr_mont(out, out, 4);                          //f0
+    ecp_nistz256_ord_mul_mont(out, out, &table[(15-1)*P256_LIMBS]);  //ff
+    memcpy(t, out, sizeof(t));
+
+    ecp_nistz256_ord_sqr_mont(out, out, 8);                          //ff00
+    ecp_nistz256_ord_mul_mont(out, out, t);                          //ffff
+    memcpy(t, out, sizeof(t));
+
+    ecp_nistz256_ord_sqr_mont(out, out, 16);                         //ffff0000
+    ecp_nistz256_ord_mul_mont(out, out, t);                          //ffffffff
+    memcpy(t, out, sizeof(t));
+
+    ecp_nistz256_ord_sqr_mont(out, out, 64);                         //ffffffff0000000000000000
+    ecp_nistz256_ord_mul_mont(out, out, t);                          //ffffffff00000000ffffffff
+
+    ecp_nistz256_ord_sqr_mont(out, out, 32);                         //ffffffff00000000ffffffff00000000
+    ecp_nistz256_ord_mul_mont(out, out, t);                          //ffffffff00000000ffffffffffffffff
+
+    // The bottom 128 bits of the exponent are handled with the precomputed table
+    for (i = 0; i < 32; i++) {
+        ecp_nistz256_ord_sqr_mont(out, out, 4);
+        ecp_nistz256_ord_mul_mont(out, out, &table[(expLo[i]-1)*P256_LIMBS]);
+    }
+
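+    // Multiplying by the plain value 1 takes the result out of the
+    // Montgomery domain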
+    ecp_nistz256_ord_mul_mont(out, out, ONE);
+
+    if (bn_wexpand(r, P256_LIMBS) == NULL)
+        goto err;
+    bn_set_top(r, P256_LIMBS);
+    bn_set_data(r, out, sizeof(out));
+    bn_correct_top(r);
+    ret = 1;
+err:
+    return ret;
+}
+#endif
+
 static int ecp_nistz256_get_affine(const EC_GROUP *group,
                                    const EC_POINT *point,
                                    BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
@@ -1510,7 +1620,12 @@ const EC_METHOD *EC_GFp_nistz256_method(void)
         0,                                          /* field_div */
         ec_GFp_mont_field_encode,
         ec_GFp_mont_field_decode,
-        ec_GFp_mont_field_set_to_one
+        ec_GFp_mont_field_set_to_one,
+#ifdef __x86_64
+        ecp_nistz256_inv_mod_ord
+#else
+        0
+#endif
     };
 
     return &ret;
diff --git a/crypto/ecdsa/ecs_ossl.c b/crypto/ecdsa/ecs_ossl.c
index ce2973d..0194aed 100644
--- a/crypto/ecdsa/ecs_ossl.c
+++ b/crypto/ecdsa/ecs_ossl.c
@@ -158,9 +158,10 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in,
          * compute G*k using an equivalent scalar of fixed bit-length.
          */
 
-        if (!BN_add(k, k, order))
-            goto err;
-        if (BN_num_bits(k) <= BN_num_bits(order))
+        if (BN_num_bits(k) < BN_num_bits(order))
+            if (!BN_add(k, k, order))
+                goto err;
+        if (BN_num_bits(k) < BN_num_bits(order))
             if (!BN_add(k, k, order))
                 goto err;
 
@@ -195,30 +196,33 @@ static int ecdsa_sign_setup(EC_KEY *eckey, BN_CTX *ctx_in,
     }
     while (BN_is_zero(r));
 
-    /* compute the inverse of k */
-    if (EC_GROUP_get_mont_data(group) != NULL) {
-        /*
-         * We want inverse in constant time, therefore we utilize the fact
-         * order must be prime and use Fermats Little Theorem instead.
-         */
-        if (!BN_set_word(X, 2)) {
-            ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
-            goto err;
-        }
-        if (!BN_mod_sub(X, order, X, order, ctx)) {
-            ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
-            goto err;
-        }
-        BN_set_flags(X, BN_FLG_CONSTTIME);
-        if (!BN_mod_exp_mont_consttime
-            (k, k, X, order, ctx, EC_GROUP_get_mont_data(group))) {
-            ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
-            goto err;
-        }
-    } else {
-        if (!BN_mod_inverse(k, k, order, ctx)) {
-            ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
-            goto err;
+    /* Check if optimized inverse is implemented */
+    if (EC_GROUP_do_inverse_ord(group, k, k, ctx) == 0) {
+        /* compute the inverse of k */
+        if (EC_GROUP_get_mont_data(group) != NULL) {
+            /*
+             * We want inverse in constant time, therefore we utilize the fact
+             * order must be prime and use Fermat's Little Theorem instead.
+             */
+            if (!BN_set_word(X, 2)) {
+                ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+                goto err;
+            }
+            if (!BN_mod_sub(X, order, X, order, ctx)) {
+                ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+                goto err;
+            }
+            BN_set_flags(X, BN_FLG_CONSTTIME);
+            if (!BN_mod_exp_mont_consttime
+                (k, k, X, order, ctx, EC_GROUP_get_mont_data(group))) {
+                ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+                goto err;
+            }
+        } else {
+            if (!BN_mod_inverse(k, k, order, ctx)) {
+                ECDSAerr(ECDSA_F_ECDSA_SIGN_SETUP, ERR_R_BN_LIB);
+                goto err;
+            }
         }
     }
 
@@ -410,9 +414,12 @@ static int ecdsa_do_verify(const unsigned char *dgst, int dgst_len,
         goto err;
     }
     /* calculate tmp1 = inv(S) mod order */
-    if (!BN_mod_inverse(u2, sig->s, order, ctx)) {
-        ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB);
-        goto err;
+    /* Check if optimized inverse is implemented */
+    if (EC_GROUP_do_inverse_ord(group, u2, sig->s, ctx) == 0) {
+        if (!BN_mod_inverse(u2, sig->s, order, ctx)) {
+            ECDSAerr(ECDSA_F_ECDSA_DO_VERIFY, ERR_R_BN_LIB);
+            goto err;
+        }
     }
     /* digest -> m */
     i = BN_num_bits(order);
