> > As someone pointed out there're machines with sizeof(int)==8 out there.
> > So I'd like to reserve some extra time for elaborating on the patch by
> > redefining BF_[M0-3] macros. If you allow I can also come up with some
> > alternative for #ifdef spaghetti in the beginning of
> > crypto/bf/bf_locl.org.
>
> version from ftp://ftp.openssl.org/snapshot/openssl-SNAP-19990421.tar.gz
Find attached a patch relative to the mentioned snapshot. A comment
about the #ifdef spaghetti (which Ulf moved from crypto/bf/bf_locl.org
to crypto/opensslconf.h.in): I don't see any need for it, so I've
folded the whole mumbo-jumbo into #undef BF_PTR:-) Indeed, a good
optimizing compiler should be perfectly able to deduce both the BF_PTR
and BF_PTR2 versions from the generic one. People with poor compilers
(read gcc:-) would have to experiment in either case and would come
across the comments in bf_locl.h...
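To illustrate (a sketch of mine, not part of the attached patch): the
generic lookup and the BF_PTR-style lookup compute exactly the same
address, so a decent compiler can derive one from the other. Assuming
a 32-bit BF_LONG, i.e. BF_LONG_LOG2==2:

typedef unsigned int BF_LONG;      /* assuming sizeof(int)==4 */
#define BF_LONG_LOG2 2
#define BF_M (0xFF<<BF_LONG_LOG2)  /* pre-scaled byte mask */
#define BF_0 (24-BF_LONG_LOG2)

BF_LONG lookup_generic(const BF_LONG *S, BF_LONG R)
{
    /* plain indexing: the compiler scales by sizeof(BF_LONG) itself */
    return S[(int)(R>>24)&0xff];
}

BF_LONG lookup_ptr(const BF_LONG *S, BF_LONG R)
{
    /* pre-scaled indexing: (R>>BF_0)&BF_M is already a byte offset */
    return *(const BF_LONG *)((const unsigned char *)S+((R>>BF_0)&BF_M));
}

Both fetch S[(R>>24)&0xff]; the only question is whether the compiler
folds the scaling into the addressing mode on its own.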
BTW, why doesn't one turbocharge it (well, other algorithms used more
extensively by Netscape should probably be the first target:-) by
passing and receiving the data block to be en-/decrypted by value
instead of by reference? I mean like this:
BF_LONG_LONG BF_encrypt (BF_LONG l,BF_LONG r,BF_KEY *key)
Instead of this:
void BF_encrypt (BF_LONG *data,BF_KEY *key)
Well, it wouldn't make a hell of a difference on Intel, as the
arguments have to be *written* into memory (stack or array) in either
case, but on RISC it could be a big hit! Or is nobody interested in
anything but Intel, as always? And yes, I realize it could be a pain
in the ass:-) Especially the receiving-the-result part...
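Here is roughly what I mean (a sketch with hypothetical names, rounds
elided; assumes "blowfish.h" for BF_LONG/BF_KEY and a 64-bit unsigned
long long on the target):

#include "blowfish.h"

typedef unsigned long long BF_LONG_LONG;  /* assumed 64-bit type */

BF_LONG_LONG BF_encrypt_byval(BF_LONG l, BF_LONG r, BF_KEY *key)
{
    /* ... the 16 rounds would go here, with l and r never leaving
     * registers on a register-rich RISC ABI ... */
    return ((BF_LONG_LONG)(l&0xffffffffUL)<<32)|(r&0xffffffffUL);
}

/* the caller unpacks the packed result, mirroring the stores that
 * BF_encrypt does through data[] today */
void demo(BF_LONG *data, BF_KEY *key)
{
    BF_LONG_LONG lr=BF_encrypt_byval(data[0],data[1],key);
    data[1]=(BF_LONG)(lr>>32)&0xffffffffUL;
    data[0]=(BF_LONG)lr&0xffffffffUL;
}

Both arguments and the result travel in registers on most RISC ABIs;
on Intel they'd be spilled to the stack anyway, as said above.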
> > SHA might need extra consideration too then...
It's coming soon, and will very likely be followed by a number of
patches to other digest algorithms. The idea is to clean up this
#ifdef *_ENDIAN mess, which apparently was used to mask code that
doesn't work on 64-bit little-endian platforms (a.k.a. Alpha-based).
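The flavour of that cleanup (again a sketch of mine, not the actual
patch; the macro name is made up): gather the bytes explicitly instead
of casting, so the result is well defined on either endianness and
even when unsigned long is 64 bits wide:

/* hypothetical endian-neutral load of a 32-bit big-endian word,
 * the way SHA wants it, with no #ifdef *_ENDIAN anywhere */
#define c2nl_safe(c,l)  (l =((unsigned long)(c)[0]<<24)| \
                            ((unsigned long)(c)[1]<<16)| \
                            ((unsigned long)(c)[2]<< 8)| \
                            ((unsigned long)(c)[3]    ), \
                         l&=0xffffffffUL)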
Cheers. Andy.
*** ./crypto/bf/blowfish.h.orig Tue Apr 20 18:00:10 1999
--- ./crypto/bf/blowfish.h Mon Apr 26 19:24:15 1999
***************
*** 66,73 ****
#define BF_ENCRYPT 1
#define BF_DECRYPT 0
! #ifdef WIN16
#define BF_LONG unsigned long
#else
#define BF_LONG unsigned int
#endif
--- 66,90 ----
#define BF_ENCRYPT 1
#define BF_DECRYPT 0
! /*
! * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! * ! BF_LONG has to be at least 32 bits wide. If it's wider, then !
! * ! BF_LONG_LOG2 has to be defined along. !
! * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
! */
!
! #if defined(WIN16) || defined(__LP32__)
#define BF_LONG unsigned long
+ #elif defined(_CRAY) || defined(__ILP64__)
+ #define BF_LONG unsigned long
+ #define BF_LONG_LOG2 3
+ /*
+ * _CRAY note. I could declare short, but I have no idea what impact
+ * it would have on performance on non-T3E machines. I could declare
+ * int, but at least on C90 sizeof(int) can be chosen at compile time.
+ * So I've chosen long...
+ * <[EMAIL PROTECTED]>
+ */
#else
#define BF_LONG unsigned int
#endif
*** ./crypto/bf/bf_locl.h.orig Wed Apr 21 19:30:49 1999
--- ./crypto/bf/bf_locl.h Mon Apr 26 19:21:39 1999
***************
*** 151,206 ****
/* This is actually a big endian algorithm, the most significate byte
* is used to lookup array 0 */
- #define BF_M 0x3fc
- #define BF_0 22L
- #define BF_1 14L
- #define BF_2 6L
- #define BF_3 2L /* left shift */
-
#if defined(BF_PTR2)
! /* This is basically a special pentium verson */
! #define BF_ENC(LL,R,S,P) \
! { \
! BF_LONG t,u,v; \
! u=R>>BF_0; \
! v=R>>BF_1; \
! u&=BF_M; \
! v&=BF_M; \
! t= *(BF_LONG *)((unsigned char *)&(S[ 0])+u); \
! u=R>>BF_2; \
! t+= *(BF_LONG *)((unsigned char *)&(S[256])+v); \
! v=R<<BF_3; \
! u&=BF_M; \
! v&=BF_M; \
! t^= *(BF_LONG *)((unsigned char *)&(S[512])+u); \
! LL^=P; \
! t+= *(BF_LONG *)((unsigned char *)&(S[768])+v); \
! LL^=t; \
! }
#elif defined(BF_PTR)
! /* This is normally very good */
! #define BF_ENC(LL,R,S,P) \
! LL^=P; \
LL^= (((*(BF_LONG *)((unsigned char *)&(S[ 0])+((R>>BF_0)&BF_M))+ \
*(BF_LONG *)((unsigned char *)&(S[256])+((R>>BF_1)&BF_M)))^ \
*(BF_LONG *)((unsigned char *)&(S[512])+((R>>BF_2)&BF_M)))+ \
! *(BF_LONG *)((unsigned char *)&(S[768])+((R<<BF_3)&BF_M)));
#else
! /* This will always work, even on 64 bit machines and strangly enough,
! * on the Alpha it is faster than the pointer versions (both 32 and 64
! * versions of BF_LONG) */
! #define BF_ENC(LL,R,S,P) \
! LL^=P; \
! LL^=((( S[ (int)(R>>24L) ] + \
! S[0x0100+((int)(R>>16L)&0xff)])^ \
! S[0x0200+((int)(R>> 8L)&0xff)])+ \
! S[0x0300+((int)(R )&0xff)])&0xffffffffL;
#endif
#endif
--- 151,219 ----
/* This is actually a big endian algorithm, the most significate byte
* is used to lookup array 0 */
#if defined(BF_PTR2)
! /*
! * This is basically a special Intel version. The point is that Intel
! * doesn't have many registers, but offers a rich choice of addressing
! * modes. So we spare some registers by traversing the BF_KEY
! * structure directly and employing the fanciest addressing mode. The
! * code generated by EGCS is *perfectly* competitive with the
! * assembler implementation!
! */
! #define BF_ENC(LL,R,KEY,Pi) (\
! LL^=KEY[Pi], \
! t= KEY[BF_ROUNDS+2 + 0 + ((R>>24)&0xFF)], \
! t+= KEY[BF_ROUNDS+2 + 256 + ((R>>16)&0xFF)], \
! t^= KEY[BF_ROUNDS+2 + 512 + ((R>>8 )&0xFF)], \
! t+= KEY[BF_ROUNDS+2 + 768 + ((R )&0xFF)], \
! LL^=t \
! )
#elif defined(BF_PTR)
! #ifndef BF_LONG_LOG2
! #define BF_LONG_LOG2 2 /* default to BF_LONG being 32 bits */
! #endif
! #define BF_M (0xFF<<BF_LONG_LOG2)
! #define BF_0 (24-BF_LONG_LOG2)
! #define BF_1 (16-BF_LONG_LOG2)
! #define BF_2 ( 8-BF_LONG_LOG2)
! #define BF_3 BF_LONG_LOG2 /* left shift */
! /*
! * This is normally very good on RISC platforms, where you normally
! * have to explicitly multiply the array index by sizeof(BF_LONG)
! * in order to calculate the effective address. This implementation
! * spares the CPU that extra work. PowerPC users should have the most
! * fun, as (R>>BF_i)&BF_M gets folded into a single instruction, namely
! * rlwinm. So let 'em double-check if their compiler does it.
! */
!
! #define BF_ENC(LL,R,S,P) ( \
! LL^=P, \
LL^= (((*(BF_LONG *)((unsigned char *)&(S[ 0])+((R>>BF_0)&BF_M))+ \
*(BF_LONG *)((unsigned char *)&(S[256])+((R>>BF_1)&BF_M)))^ \
*(BF_LONG *)((unsigned char *)&(S[512])+((R>>BF_2)&BF_M)))+ \
! *(BF_LONG *)((unsigned char *)&(S[768])+((R<<BF_3)&BF_M))) \
! )
#else
! /*
! * This is a *generic* version. It seems to perform best on platforms
! * that offer explicit support for extraction of 8-bit quantities,
! * preferably complemented with "multiplication" of the array index by
! * sizeof(BF_LONG). At the moment of this writing the list comprises
! * the Alpha CPU, featuring the extbl and s[48]addq instructions.
! */
! #define BF_ENC(LL,R,S,P) ( \
! LL^=P, \
! LL^=((( S[ ((int)(R>>24)&0xff)] + \
! S[0x0100+((int)(R>>16)&0xff)])^ \
! S[0x0200+((int)(R>> 8)&0xff)])+ \
! S[0x0300+((int)(R )&0xff)])&0xffffffffL \
! )
#endif
#endif
*** ./crypto/bf/bf_enc.c.orig Tue Apr 20 00:00:16 1999
--- ./crypto/bf/bf_enc.c Mon Apr 26 18:36:43 1999
***************
*** 71,76 ****
--- 71,77 ----
void BF_encrypt(BF_LONG *data, BF_KEY *key)
{
+ #ifndef BF_PTR2
register BF_LONG l,r,*p,*s;
p=key->P;
***************
*** 105,110 ****
--- 106,146 ----
data[1]=l&0xffffffffL;
data[0]=r&0xffffffffL;
+ #else
+ register BF_LONG l,r,t,*k;
+
+ l=data[0];
+ r=data[1];
+ k=(BF_LONG*)key;
+
+ l^=k[0];
+ BF_ENC(r,l,k, 1);
+ BF_ENC(l,r,k, 2);
+ BF_ENC(r,l,k, 3);
+ BF_ENC(l,r,k, 4);
+ BF_ENC(r,l,k, 5);
+ BF_ENC(l,r,k, 6);
+ BF_ENC(r,l,k, 7);
+ BF_ENC(l,r,k, 8);
+ BF_ENC(r,l,k, 9);
+ BF_ENC(l,r,k,10);
+ BF_ENC(r,l,k,11);
+ BF_ENC(l,r,k,12);
+ BF_ENC(r,l,k,13);
+ BF_ENC(l,r,k,14);
+ BF_ENC(r,l,k,15);
+ BF_ENC(l,r,k,16);
+ #if BF_ROUNDS == 20
+ BF_ENC(r,l,k,17);
+ BF_ENC(l,r,k,18);
+ BF_ENC(r,l,k,19);
+ BF_ENC(l,r,k,20);
+ #endif
+ r^=k[BF_ROUNDS+1];
+
+ data[1]=l&0xffffffffL;
+ data[0]=r&0xffffffffL;
+ #endif
}
#ifndef BF_DEFAULT_OPTIONS
***************
*** 111,116 ****
--- 147,153 ----
void BF_decrypt(BF_LONG *data, BF_KEY *key)
{
+ #ifndef BF_PTR2
register BF_LONG l,r,*p,*s;
p=key->P;
***************
*** 145,150 ****
--- 182,222 ----
data[1]=l&0xffffffffL;
data[0]=r&0xffffffffL;
+ #else
+ register BF_LONG l,r,t,*k;
+
+ l=data[0];
+ r=data[1];
+ k=(BF_LONG *)key;
+
+ l^=k[BF_ROUNDS+1];
+ #if BF_ROUNDS == 20
+ BF_ENC(r,l,k,20);
+ BF_ENC(l,r,k,19);
+ BF_ENC(r,l,k,18);
+ BF_ENC(l,r,k,17);
+ #endif
+ BF_ENC(r,l,k,16);
+ BF_ENC(l,r,k,15);
+ BF_ENC(r,l,k,14);
+ BF_ENC(l,r,k,13);
+ BF_ENC(r,l,k,12);
+ BF_ENC(l,r,k,11);
+ BF_ENC(r,l,k,10);
+ BF_ENC(l,r,k, 9);
+ BF_ENC(r,l,k, 8);
+ BF_ENC(l,r,k, 7);
+ BF_ENC(r,l,k, 6);
+ BF_ENC(l,r,k, 5);
+ BF_ENC(r,l,k, 4);
+ BF_ENC(l,r,k, 3);
+ BF_ENC(r,l,k, 2);
+ BF_ENC(l,r,k, 1);
+ r^=k[0];
+
+ data[1]=l&0xffffffffL;
+ data[0]=r&0xffffffffL;
+ #endif
}
void BF_cbc_encrypt(unsigned char *in, unsigned char *out, long length,
*** ./crypto/opensslconf.h.in.orig Wed Apr 21 19:33:52 1999
--- ./crypto/opensslconf.h.in Mon Apr 26 19:28:29 1999
***************
*** 54,80 ****
#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
#define CONFIG_HEADER_BF_LOCL_H
! /* Special defines which change the way the code is built depending on the
! CPU and OS. For SGI machines you can use _MIPS_SZLONG (32 or 64) to find
! even newer MIPS CPU's, but at the moment one size fits all for
! optimization options. Older Sparc's work better with only UNROLL, but
! there's no way to tell at compile time what it is you're running on */
!
! #if defined( sun ) /* Newer Sparc's */
! # define BF_PTR
! #elif defined( __ultrix ) /* Older MIPS */
! # define BF_PTR
! #elif defined( __sgi ) /* Newer MIPS */
! # define BF_PTR
! #endif /* Systems-specific speed defines */
!
! /* use BF_PTR2 for intel boxes,
! * BF_PTR for sparc and MIPS/SGI
! * use nothing for Alpha and HP.
! */
! #if !defined(BF_PTR) && !defined(BF_PTR2)
! #define BF_PTR2
! #endif
#endif /* HEADER_BF_LOCL_H */
#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)
--- 54,60 ----
#if defined(HEADER_BF_LOCL_H) && !defined(CONFIG_HEADER_BF_LOCL_H)
#define CONFIG_HEADER_BF_LOCL_H
! #undef BF_PTR
#endif /* HEADER_BF_LOCL_H */
#if defined(HEADER_DES_LOCL_H) && !defined(CONFIG_HEADER_DES_LOCL_H)