Sorry, I forgot to attach the patch. Should be attached now.
On Fri, 18 Mar 2022, Elijah Stone wrote:
What was I thinking? i. is much more prestigious than /:
x=: ?1e8$65536
y=: 0+x NB. avoid selfie optimizations. The effect is still
NB. present, but less pronounced
x4=: 10 u:x
y4=: 10 u:y
x2=: 2 u:x4
y2=: 2 u:y4
timex'x i.y'
0.728156
timex'x4 i.y4'
3.8804
timex'x2 i.y2'
0.651964
Probably an oversight when 4-byte chars were added, on the theory that
2-byte chars would always get small-range optimization. (Which is, as a
matter of fact, not the case.)
Attached is a patch fixing the problem. This also helps with 4-column
matrices of 1-byte chars etc. (I think there may be reasonably cheap
gains to be made across the board for i. on matrices of few columns, but
at the very least small power-of-two item sizes deserve to be fast.)
This should also improve performance for smaller 2-byte arrays that do
not make the small-range cut. With the patch:
timex'x4 i.y4'
0.613455
-E
----------------------------------------------------------------------
For information about J forums see http://www.jsoftware.com/forums.htm
diff --git a/jsrc/viavx.c b/jsrc/viavx.c
index 67a1dc47..f55024ec 100644
--- a/jsrc/viavx.c
+++ b/jsrc/viavx.c
@@ -588,35 +588,40 @@ static __forceinline I icmpeq(I *a, I *w, I n) {
// jtioq RAT
// jtioi1 k==SZI, INT/SBT/char/bool not small-range
// jtioi list of >1 INT
+// jtioCk k-sized bool/char
// jtioc k!=SZI, bool (must be list of em)/char/INT/SBT
// jtioc01 intolerant FL atom
// jtioc0 intolerant FL array
// jtioz01 intolerant CMPX atom
// jtioz0 intolerant CMPX array
-static IOFX(A,US,jtioax1,,cthia(~0LL,1.0,C(*v)),!equ(C(*v),C(av[hj])),1 ) /*
boxed exact 1-element item */
-static IOFX(A,US,jtioau,, hiau(C(*v)), !equ(C(*v),C(av[hj])),1 ) /* boxed
uniform type */
-static IOFX(X,US,jtiox,, hix(v), !eqx(n,v,av+n*hj),
cn) /* extended integer */
-static IOFX(Q,US,jtioq,, hiq(v), !eqq(n,v,av+n*hj),
cn) /* rational number */
-static IOFX(C,US,jtioc,, hic(k,(UC*)v), memcmpne(v,av+k*hj,k),
cn) /* boolean, char, or integer */
-static IOFX(I,US,jtioi,COMPSETUP, hici(n,v), COMPCALL(av),
cn ) // INT array, not float
-static IOFX(I,US,jtioi1,, hici1(v), *v!=av[hj],
1 ) // len=8, not float
-static IOFX(D,US,jtioc01,, hic01((UIL*)v), *v!=av[hj],
1) // float atom
-static IOFX(Z,US,jtioz01,, hic0(2,(UIL*)v),
(v[0].re!=av[hj].re)||(v[0].im!=av[hj].im), 1) // complex atom
-static IOFX(D,US,jtioc0,, hic0(n,(UIL*)v), fcmp0(v,&av[n*hj],n),
cn) // float array
-static IOFX(Z,US,jtioz0,, hic0(2*n,(UIL*)v),
fcmp0((D*)v,(D*)&av[n*hj],2*n), cn) // complex array
-
-static IOFX(A,UI4,jtioax12,,cthia(~0LL,1.0,C(*v)),!equ(C(*v),C(av[hj])),1 )
/* boxed exact 1-element item */
-static IOFX(A,UI4,jtioau2,, hiau(C(*v)), !equ(C(*v),C(av[hj])),1 ) /* boxed
uniform type */
-static IOFX(X,UI4,jtiox2,, hix(v), !eqx(n,v,av+n*hj),
cn) /* extended integer */
-static IOFX(Q,UI4,jtioq2,, hiq(v), !eqq(n,v,av+n*hj),
cn) /* rational number */
-static IOFX(C,UI4,jtioc2,, hic(k,(UC*)v), memcmpne(v,av+k*hj,k),
cn) /* boolean, char, or integer */
-static IOFX(I,UI4,jtioi2,COMPSETUP, hici(n,v), COMPCALL(av),
cn ) // INT array, not float
-static IOFX(I,UI4,jtioi12,, hici1(v), *v!=av[hj],
1 ) // len=8, not float
-static IOFX(D,UI4,jtioc012,, hic01((UIL*)v), *v!=av[hj],
1) // float atom
-static IOFX(Z,UI4,jtioz012,, hic0(2,(UIL*)v),
(v[0].re!=av[hj].re)||(v[0].im!=av[hj].im), 1) // complex atom
-static IOFX(D,UI4,jtioc02,, hic0(n,(UIL*)v), fcmp0(v,&av[n*hj],n),
cn) // float array
-static IOFX(Z,UI4,jtioz02,, hic0(2*n,(UIL*)v),
fcmp0((D*)v,(D*)&av[n*hj],2*n), cn) // complex array
+static IOFX(A, US,jtioax1,,cthia(~0LL,1.0,C(*v)),!equ(C(*v),C(av[hj])),1) //
boxed exact 1-element item
+static IOFX(A, US,jtioau,, hiau(C(*v)), !equ(C(*v),C(av[hj])), 1) //
boxed uniform type
+static IOFX(X, US,jtiox,, hix(v), !eqx(n,v,av+n*hj), cn) //
extended integer
+static IOFX(Q, US,jtioq,, hiq(v), !eqq(n,v,av+n*hj), cn) //
rational number
+static IOFX(C, US,jtioc,, hic(k,(UC*)v), memcmpne(v,av+k*hj,k), cn) //
boolean, char, or integer
+static IOFX(I, US,jtioi,COMPSETUP,hici(n,v),COMPCALL(av), cn) //
INT array, not float
+static IOFX(C2,US,jtioC2,, hici1((C2*)v), *v!=av[hj], 1) //
2-byte (char)
+static IOFX(C4,US,jtioC4,, hici1((C4*)v), *v!=av[hj], 1) //
4-byte (char)
+static IOFX(I, US,jtioi1,, hici1(v), *v!=av[hj], 1) //
len=8, not float
+static IOFX(D, US,jtioc01,,hic01((UIL*)v), *v!=av[hj], 1) //
float atom
+static IOFX(Z, US,jtioz01,,hic0(2,(UIL*)v),
(v[0].re!=av[hj].re)||(v[0].im!=av[hj].im), 1) // complex atom
+static IOFX(D, US,jtioc0,, hic0(n,(UIL*)v), fcmp0(v,&av[n*hj],n), cn) //
float array
+static IOFX(Z, US,jtioz0,,
hic0(2*n,(UIL*)v),fcmp0((D*)v,(D*)&av[n*hj],2*n),cn) // complex array
+
+static IOFX(A, UI4,jtioax12,,cthia(~0LL,1.0,C(*v)),!equ(C(*v),C(av[hj])),1) //
boxed exact 1-element item
+static IOFX(A, UI4,jtioau2,, hiau(C(*v)), !equ(C(*v),C(av[hj])), 1) //
boxed uniform type
+static IOFX(X, UI4,jtiox2,, hix(v), !eqx(n,v,av+n*hj), cn) //
extended integer
+static IOFX(Q, UI4,jtioq2,, hiq(v), !eqq(n,v,av+n*hj), cn) //
rational number
+static IOFX(C, UI4,jtioc2,, hic(k,(UC*)v), memcmpne(v,av+k*hj,k), cn) //
boolean, char, or integer
+static IOFX(I, UI4,jtioi2,COMPSETUP,hici(n,v),COMPCALL(av), cn) //
INT array, not float
+static IOFX(C2,UI4,jtioC22,, hici1((C2*)v), *v!=av[hj], 1) //
2-byte (char)
+static IOFX(C4,UI4,jtioC42,, hici1((C4*)v), *v!=av[hj], 1) //
4-byte (char)
+static IOFX(I, UI4,jtioi12,, hici1(v), *v!=av[hj], 1) //
len=8, not float
+static IOFX(D, UI4,jtioc012,,hic01((UIL*)v), *v!=av[hj], 1) //
float atom
+static IOFX(Z, UI4,jtioz012,,hic0(2,(UIL*)v),
(v[0].re!=av[hj].re)||(v[0].im!=av[hj].im), 1) // complex atom
+static IOFX(D, UI4,jtioc02,, hic0(n,(UIL*)v), fcmp0(v,&av[n*hj],n), cn) //
float array
+static IOFX(Z, UI4,jtioz02,,
hic0(2*n,(UIL*)v),fcmp0((D*)v,(D*)&av[n*hj],2*n),cn) // complex array
// ********************* second class: tolerant comparisons, possibly boxed
**********************
@@ -984,9 +989,10 @@ static
IOFT(A,UI4,jtioa12,cthia(ctmask,1.0,C(*v)),TFINDBX,TFINDBY,TFINDBYKEY,!eq
}
// The verbs to do the work, for different item lengths and hashtable sizes
-static IOFSMALLRANGE(jtio12,UC,US) static IOFSMALLRANGE(jtio14,UC,UI4) //
1-byte items, using small/large hashtable
-static IOFSMALLRANGE(jtio22,US,US) static IOFSMALLRANGE(jtio24,US,UI4) //
2-byte items, using small/large hashtable
-static IOFSMALLRANGE(jtio42,I,US) static IOFSMALLRANGE(jtio44,I,UI4) //
4/8-byte items, using small/large hashtable
+static IOFSMALLRANGE(jtio12,UC, US) static IOFSMALLRANGE(jtio14,UC, UI4) //
1-byte items, using small/large hashtable
+static IOFSMALLRANGE(jtio22,US, US) static IOFSMALLRANGE(jtio24,US, UI4) //
2-byte items, using small/large hashtable
+static IOFSMALLRANGE(jtio42,UI4,US) static IOFSMALLRANGE(jtio44,UI4,UI4) //
4-byte items, using small/large hashtable
+static IOFSMALLRANGE(jtio82,I, US) static IOFSMALLRANGE(jtio84,I, UI4) //
SZI-byte items, using small/large hashtable
// ******************* fourth class: sequential comparison
***************************************
// implemented only for i. i: e. u/. - perhaps should revert for other
compounds
@@ -1331,29 +1337,33 @@ static I jtutype(J jt,A w,I c){A*wv,x;I m,t;
R h;
\
}
-static IOFXW(A,US,jtiowax1,,cthia(~0LL,1.0,C(*v)),!equ(C(*v),C(wv[hj])),1 )
/* boxed exact 1-element item */
-static IOFXW(A,US,jtiowau,, hiau(C(*v)), !equ(C(*v),C(wv[hj])),1 ) /* boxed
uniform type */
-static IOFXW(X,US,jtiowx,, hix(v), !eqx(n,v,wv+n*hj),
cn) /* extended integer */
-static IOFXW(Q,US,jtiowq,, hiq(v), !eqq(n,v,wv+n*hj),
cn) /* rational number */
-static IOFXW(C,US,jtiowc,, hic(k,(UC*)v), memcmpne(v,wv+k*hj,k),
cn) /* boolean, char, or integer */
-static IOFXW(I,US,jtiowi,COMPSETUP, hici(n,v), COMPCALL(wv),
cn ) // INT array, not float
-static IOFXW(I,US,jtiowi1,, hici1(v), *v!=wv[hj],
1 ) // len=8, not float
-static IOFXW(D,US,jtiowc01,, hic01((UIL*)v), *v!=wv[hj],
1) // float atom
-static IOFXW(Z,US,jtiowz01,, hic0(2,(UIL*)v),
(v[0].re!=wv[hj].re)||(v[0].im!=wv[hj].im), 1) // complex atom
-static IOFXW(D,US,jtiowc0,, hic0(n,(UIL*)v), fcmp0(v,&wv[n*hj],n),
cn) // float array
-static IOFXW(Z,US,jtiowz0,, hic0(2*n,(UIL*)v),
fcmp0((D*)v,(D*)&wv[n*hj],2*n), cn) // complex array
-
-static IOFXW(A,UI4,jtiowax12,,cthia(~0LL,1.0,C(*v)),!equ(C(*v),C(wv[hj])),1 )
/* boxed exact 1-element item */
-static IOFXW(A,UI4,jtiowau2,, hiau(C(*v)), !equ(C(*v),C(wv[hj])),1 ) /*
boxed uniform type */
-static IOFXW(X,UI4,jtiowx2,, hix(v), !eqx(n,v,wv+n*hj),
cn) /* extended integer */
-static IOFXW(Q,UI4,jtiowq2,, hiq(v), !eqq(n,v,wv+n*hj),
cn) /* rational number */
-static IOFXW(C,UI4,jtiowc2,, hic(k,(UC*)v), memcmpne(v,wv+k*hj,k),
cn) /* boolean, char, or integer */
-static IOFXW(I,UI4,jtiowi2,COMPSETUP, hici(n,v), COMPCALL(wv),
cn ) // INT array, not float
-static IOFXW(I,UI4,jtiowi12,, hici1(v), *v!=wv[hj],
1 ) // len=8, not float
-static IOFXW(D,UI4,jtiowc012,, hic01((UIL*)v), *v!=wv[hj],
1) // float atom
-static IOFXW(Z,UI4,jtiowz012,, hic0(2,(UIL*)v),
(v[0].re!=wv[hj].re)||(v[0].im!=wv[hj].im), 1) // complex atom
-static IOFXW(D,UI4,jtiowc02,, hic0(n,(UIL*)v), fcmp0(v,&wv[n*hj],n),
cn) // float array
-static IOFXW(Z,UI4,jtiowz02,, hic0(2*n,(UIL*)v),
fcmp0((D*)v,(D*)&wv[n*hj],2*n), cn) // complex array
+static IOFXW(A, US,jtiowax1,,cthia(~0LL,1.0,C(*v)),!equ(C(*v),C(wv[hj])), 1)
// boxed exact 1-element item
+static IOFXW(A, US,jtiowau,, hiau(C(*v)), !equ(C(*v),C(wv[hj])), 1)
// boxed uniform type
+static IOFXW(X, US,jtiowx,, hix(v), !eqx(n,v,wv+n*hj), cn)
// extended integer
+static IOFXW(Q, US,jtiowq,, hiq(v), !eqq(n,v,wv+n*hj), cn)
// rational number
+static IOFXW(C, US,jtiowc,, hic(k,(UC*)v), memcmpne(v,wv+k*hj,k), cn)
// boolean, char, or integer
+static IOFXW(I, US,jtiowi,COMPSETUP,hici(n,v),COMPCALL(wv), cn)
// INT array, not float
+static IOFXW(C2,US,jtiow21,, hici1((C2*)v), *v!=wv[hj], 1)
// 2-byte (char)
+static IOFXW(C4,US,jtiow41,, hici1((C4*)v), *v!=wv[hj], 1)
// 4-byte (char)
+static IOFXW(I, US,jtiowi1,, hici1(v), *v!=wv[hj], 1)
// len=8, not float
+static IOFXW(D, US,jtiowc01,,hic01((UIL*)v), *v!=wv[hj], 1)
// float atom
+static IOFXW(Z, US,jtiowz01,,hic0(2,(UIL*)v),
(v[0].re!=wv[hj].re)||(v[0].im!=wv[hj].im), 1) // complex atom
+static IOFXW(D, US,jtiowc0,, hic0(n,(UIL*)v), fcmp0(v,&wv[n*hj],n), cn)
// float array
+static IOFXW(Z, US,jtiowz0,,
hic0(2*n,(UIL*)v),fcmp0((D*)v,(D*)&wv[n*hj],2*n),cn) // complex array
+
+static IOFXW(A, UI4,jtiowax12,,cthia(~0LL,1.0,C(*v)),!equ(C(*v),C(wv[hj])), 1)
// boxed exact 1-element item
+static IOFXW(A, UI4,jtiowau2,, hiau(C(*v)), !equ(C(*v),C(wv[hj])), 1)
// boxed uniform type
+static IOFXW(X, UI4,jtiowx2,, hix(v), !eqx(n,v,wv+n*hj), cn)
// extended integer
+static IOFXW(Q, UI4,jtiowq2,, hiq(v), !eqq(n,v,wv+n*hj), cn)
// rational number
+static IOFXW(C, UI4,jtiowc2,, hic(k,(UC*)v), memcmpne(v,wv+k*hj,k), cn)
// boolean, char, or integer
+static IOFXW(I, UI4,jtiowi2,COMPSETUP,hici(n,v),COMPCALL(wv), cn)
// INT array, not float
+static IOFXW(C2,UI4,jtiow212,, hici1((C2*)v), *v!=wv[hj], 1)
// 2-byte (char)
+static IOFXW(C4,UI4,jtiow412,, hici1((C4*)v), *v!=wv[hj], 1)
// 4-byte (char)
+static IOFXW(I, UI4,jtiowi12,, hici1(v), *v!=wv[hj], 1)
// len=8, not float
+static IOFXW(D, UI4,jtiowc012,,hic01((UIL*)v), *v!=wv[hj], 1)
// float atom
+static IOFXW(Z, UI4,jtiowz012,,hic0(2,(UIL*)v),
(v[0].re!=wv[hj].re)||(v[0].im!=wv[hj].im), 1) // complex atom
+static IOFXW(D, UI4,jtiowc02,, hic0(n,(UIL*)v), fcmp0(v,&wv[n*hj],n),
cn) // float array
+static IOFXW(Z, UI4,jtiowz02,,
hic0(2*n,(UIL*)v),fcmp0((D*)v,(D*)&wv[n*hj],2*n), cn) // complex array
// *************************** seventh class: small-range processing of w
***********************
@@ -1707,6 +1717,7 @@ static CR condrange2(US *s,I n,I min,I max,I maxrange){CR
ret;I i;US x;
// jtioq RAT
// jtioi1 k==SZI, INT/SBT/char/bool not small-range
// jtioi list of >1 INT
+// jtioCk k-sized bool/char
// jtioc k!=SZI, bool (must be list of em)/char/INT/SBT
// jtioc01 intolerant FL atom
// jtioc0 intolerant FL array
@@ -1723,7 +1734,7 @@ static CR condrange2(US *s,I n,I min,I max,I maxrange){CR
ret;I i;US x;
#define FNTBLSMALL1 12 // small-range, 1-byte items
#define FNTBLSMALL2 13 // small-range, 2-byte items
#define FNTBLSMALL4 14 // small-range, 4-byte items
-#define FNTBLONEINT 15 // hash of single INT-sized exact value
+#define FNTBLSMALLI 15 // small-range, SZI-byte items
#define FNTBLBOXARRAY 20 // array of boxes, tolerant or not (we just hash on
shape)
#define FNTBLBOXINTOLERANT 21 // single box but intolerant
#define FNTBLBOXUNIFORM 22 // single box, but a and w have uniform contents
@@ -1731,56 +1742,65 @@ static CR condrange2(US *s,I n,I min,I max,I
maxrange){CR ret;I i;US x;
#define FNTBLXNUM 24 // hashed xnum
#define FNTBLRAT 25 // hashed rat
#define FNTBLBOXSSORT 26 // boxes, handled by sorting and binary search
-#define FNTBLREVERSE 27 // where the reversed hashes start
-#define FNTBLSIZE 54 // number of functions - before the second half
+#define FNTBL2 27 // 2-byte (probably characters)
+#define FNTBL4 28 // 4-byte (probably characters)
+#define FNTBLI 29 // SZI-byte (will be duplicate of 4 on 32-bit, but who
cares)
+#define FNTBLREVERSE 30 // where the reversed hashes start
+#define FNTBLSIZE 60 // number of functions - before the second half
static const AF fntbl[]={
// prefix: routines used without hashtables, flags, etc
jtiosc, // sequential comparison (-2) - we pass in extra args
jtiosfu, // i.!.1 - sequential file update (-1)
// US tables
jtioc,jtioc,jtioc,jtioc,jtioi,jtioi,jtioi,jtioi, // bool, INT
- jtiod,jtioc0,jtiod1,jtioc01,jtio12,jtio22,jtio42,jtioi1, // FL (then
small-range, then ONEINT)
+ jtiod,jtioc0,jtiod1,jtioc01,jtio12,jtio22,jtio42,jtio82, // FL (then
small-range)
jtioz,jtioz0,jtioz1,jtioz01, // CMPX
jtioa,jtioax1,jtioau,jtioa1, // atomic types
jtiox,jtioq,
- jtiobs,
+ jtiobs,jtioC2,jtioC4,jtioi1,
jtiowc,jtiowc,jtiowc,jtiowc,jtiowi,jtiowi,jtiowi,jtiowi, // bool, INT
- 0,jtiowc0,0,jtiowc01,0,0,jtio42w,jtiowi1, // FL (then small-range, then
ONEINT)
+ 0,jtiowc0,0,jtiowc01,0,0,jtio42w,0, // FL (then small-range)
0,jtiowz0,0,jtiowz01, // CMPX
0,0,0,0,
0,0,
- 0,
+ 0,jtiow21,jtiow41,jtiowi1,
// UI4 tables
jtioc2,jtioc2,jtioc2,jtioc2,jtioi2,jtioi2,jtioi2,jtioi2, // bool, INT
- jtiod2,jtioc02,jtiod12,jtioc012,jtio14,jtio24,jtio44,jtioi12, // FL (then
small-range, then ONEINT)
+ jtiod2,jtioc02,jtiod12,jtioc012,jtio14,jtio24,jtio44,jtio84, // FL (then
small-range)
jtioz2,jtioz02,jtioz12,jtioz012, // CMPX
jtioa2,jtioax12,jtioau2,jtioa12, // atomic types
jtiox2,jtioq2,
- jtiobs,
+ jtiobs,jtioC22,jtioC42,jtioi12,
jtiowc2,jtiowc2,jtiowc2,jtiowc2,jtiowi2,jtiowi2,jtiowi2,jtiowi2, // bool, INT
- 0,jtiowc02,0,jtiowc012,0,0,jtio44w,jtiowi12, // FL (then small-range, then
ONEINT)
- 0,jtiowz02,0,jtiowz012 // CMPX
-
+ 0,jtiowc02,0,jtiowc012,0,0,jtio44w,0, // FL (then small-range)
+ 0,jtiowz02,0,jtiowz012, // CMPX
+ 0,0,0,0,
+ 0,0,
+ 0,jtiow212,jtiow412,jtiowi12,
};
static const S fnflags[]={ // 0 values reserved for small-range. They turn
off booladj
IIMODFULL,IIMODFULL,IIMODFULL,IIMODFULL,IIMODFULL,IIMODFULL,IIMODFULL,IIMODFULL,
// bool, INT
- IIMODFULL,IIMODFULL,IIMODFULL,IIMODFULL,0,0,0,IIMODFULL, // FL (then
small-range, then ONEINT)
+ IIMODFULL,IIMODFULL,IIMODFULL,IIMODFULL,0,0,0,0, // FL (then small-range)
IIMODFULL,IIMODFULL,IIMODFULL,IIMODFULL, // CMPX
IIMODFULL,IIMODFULL,IIMODFULL,IIMODFULL, // atomic types
IIMODFULL,IIMODFULL,
-2, // 'no hashing' (for box search)
+ IIMODFULL,IIMODFULL,IIMODFULL,
// Reversed hashes, where supported. IIMODFULL is not needed by the
reversed-hash code so we continue its use, started above, as a flag to turn off
booleans
IREVERSED|IIMODFULL,IREVERSED|IIMODFULL,IREVERSED|IIMODFULL,IREVERSED|IIMODFULL,IREVERSED|IIMODFULL,IREVERSED|IIMODFULL,IREVERSED|IIMODFULL,IREVERSED|IIMODFULL,
// bool, INT
-
IIMODFULL,IREVERSED|IIMODFULL,IIMODFULL,IREVERSED|IIMODFULL,IREVERSED,IREVERSED,IREVERSED,IIMODFULL,
// FL (then small-range, then ONEINT)
- IIMODFULL,IREVERSED|IIMODFULL,IIMODFULL,IREVERSED|IIMODFULL // CMPX
-
+
IIMODFULL,IREVERSED|IIMODFULL,IIMODFULL,IREVERSED|IIMODFULL,IREVERSED,IREVERSED,IREVERSED,IREVERSED,
// FL (then small-range)
+ IIMODFULL,IREVERSED|IIMODFULL,IIMODFULL,IREVERSED|IIMODFULL, // CMPX
+ 0,0,0,0,
+ 0,0,
+ 0,
+ IIMODFULL,IIMODFULL,IIMODFULL,
};
#define MAXBYTEBOOL 65536 // if p exceeds this, we switch over to packed bits
@@ -1959,23 +1979,28 @@ A jtindexofsub(J jt,I mode,A a,A
w){F2PREFIP;PROLOG(0079);A h=0;fauxblockINT(zfa
// result is p (the length of hashtable, as # of entries), datamin (the
minimum value found, if small-range)
// If the allocated range includes all the possible values for the input,
set IIMODFULL to indicate that fact
if(unlikely(2==k)){
- // if the actual range of the data exceeds p, we revert to hashing. All
2-byte types are exact
- CR crres =
condrange2(USAV(a),(AN(a)<<klg)>>LGSZS,-1,0,MIN((UI)(IMAX-5)>>booladj,3*m)<<booladj);
// get the range
- if(crres.range){
- datamin=crres.min;
- // If the range is close to the max, we should consider widening the
range to use the faster FULL code. We do this only for boolean hashes, because
- // in the current allocation going all the way to 65536 kicks us into
the longer hashtable (questionable decision). Otherwise we should just promote
- // any non-Boolean, because the actual cache footprint won't change.
- // The cost of promoting a Boolean is 1 store (1 clock) per word
cleared, for (65536-range)>>booladj bytes (if booladj!=0) [or (65536-range)
hashtable entries if booladj==0]
- // The savings is 4 ops (2 clocks) per word searched
- if(booladj && ((UI)(65536-crres.range)>>booladj) <
(c<<(LGSZI+1))){p=65536; datamin=0;}else{p=crres.range;} // this
underestimates the benefit for prehashes
- if(p==65536)mode|=IIMODFULL;
- fnx=FNTBLSMALL2; // This qualifies for small-range processing
+ if(3*m>=65536>>booladj){datamin=0; p=65536; mode|=IIMODFULL;
fnx=FNTBLSMALL2;} // will always qualify for small-range, so don't bother
checking range
+ else if(t!=C2T){fnx=FNTBL2;} // 2-col matrix; range-checking likely
unproductive
+ else{
+ // if the actual range of the data exceeds p, we revert to hashing. All
2-byte types are exact
+ CR crres =
condrange2(USAV(a),(AN(a)<<klg)>>LGSZS,-1,0,MIN((UI)(IMAX-5)>>booladj,3*m)<<booladj);
// get the range
+ if(!crres.range){fnx=FNTBL2;}
+ else{
+ datamin=crres.min;
+ // If the range is close to the max, we should consider widening the
range to use the faster FULL code. We do this only for boolean hashes, because
+ // in the current allocation going all the way to 65536 kicks us into
the longer hashtable (questionable decision). Otherwise we should just promote
+ // any non-Boolean, because the actual cache footprint won't change.
+ // The cost of promoting a Boolean is 1 store (1 clock) per word
cleared, for (65536-range)>>booladj bytes (if booladj!=0) [or (65536-range)
hashtable entries if booladj==0]
+ // The savings is 4 ops (2 clocks) per word searched
+ if(booladj && ((UI)(65536-crres.range)>>booladj) <
(c<<(LGSZI+1))){p=65536; datamin=0;}else{p=crres.range;} // this
underestimates the benefit for prehashes
+ if(p==65536)mode|=IIMODFULL;
+ fnx=FNTBLSMALL2; // This qualifies for small-range processing
+ }
}
}
if(likely(fnx<0)){ // if we don't have it yet, it will be a hash or
small-range integers. Decide which one
- if((k&~(t&FL))==SZI){ // non-float, might be INT or SBT, or characters.
FL has -0 problem requires SZI==FL
- if(likely((t&INT+SBT)!=0)){I fnprov;A rangearg; UI rangearglen; // same
here, for I types
+ if(((k&~(t&FL))==SZI)|(4==k)){ // non-float, might be INT or SBT, or
characters. FL has -0 problem requires SZI==FL
+ if(likely((t&INT+SBT+(4==k?C4T:0))!=0)){I fnprov;A rangearg; UI
rangearglen; // same here, for I types
// small-range processing is a possibility, but we need to decide
whether we are going to do a reversed hash, so we will
// know which range to check. For i./i:, we reverse if c is much
shorter than m; for e., we have to consider whether
// the forward hash will benefit from bits mode, so we have to estimate
the size of each hash table
@@ -1983,9 +2008,9 @@ A jtindexofsub(J jt,I mode,A a,A
w){F2PREFIP;PROLOG(0079);A h=0;fauxblockINT(zfa
// otherwise (a candidate for reversed hash), if i./i: is set, meaning a
full hashtable is needed, reverse if a is twice as long as w
// otherwise (e., which uses bitmasks in the forward hash) calculate
length of bitmask and reverse if the full table for w is shorter
// than the bitmask for a. Note that FORKEY will never cause a reversed
hash
- rangearg=a; rangearglen=m; fnprov=FNTBLSMALL4; // values for forward
check
+ rangearg=a; rangearglen=m; fnprov=FNTBLSMALL4+(k==SZI); // values for
forward check
if(mode&IIOREPS){ // if reverse check is possible, see if it is desired
- if((m>>(1))>c){rangearg=w; rangearglen=c;
fnprov=FNTBLSMALL4+FNTBLREVERSE; } // booladj?(m>MAXBYTEBOOL?5:2): omitted now
+ if((m>>(1))>c){rangearg=w; rangearglen=c; fnprov+=FNTBLREVERSE; } //
booladj?(m>MAXBYTEBOOL?5:2): omitted now
}
// we make the small-range decision mostly on length; if the range table
would be bigger than the hashtable, we use the hash. Here
// we invert the calculation to see how big a range we can tolerate
without exceeding the table size. The length of the hash, whether small-range
@@ -1994,10 +2019,11 @@ A jtindexofsub(J jt,I mode,A a,A
w){F2PREFIP;PROLOG(0079);A h=0;fauxblockINT(zfa
// small-range hash. The full hash spends more time in lookup than in
creation, because misses become more likely the
// fuller the table. This makes small-range much more valuable when the
hashes are repeated
I maxsizemult=mode&IPHCALC?6:4; // # slots/item to allow in small-range
table. More if prehash
- CR crres =
condrange(AV(rangearg),((AN(rangearg)<<klg))>>LGSZI,IMAX,IMIN,MIN((UI)(IMAX-5)>>booladj,maxsizemult*rangearglen)<<booladj);
+ CR crres = k==SZI?condrange
(AV(rangearg),((AN(rangearg)<<klg))>>LGSZI,
IMAX,IMIN,MIN((UI)(IMAX-5)>>booladj,maxsizemult*rangearglen)<<booladj)
+
:condrange4(C4AV(rangearg),((AN(rangearg)<<klg))>>LGSZUI4,IMAX,IMIN,MIN((UI)(IMAX-5)>>booladj,maxsizemult*rangearglen)<<booladj);
if(crres.range){datamin=crres.min; p=crres.range; fnx=fnprov; // use
the selected orientation
- }else{fnx=FNTBLONEINT;} // select integer hashing if range too big...
- }else{fnx=FNTBLONEINT;} // ... or some other 8-byte length (not float,
though)
+ }else{fnx=FNTBL4+(k==SZI);} // select integer hashing if range too
big...
+ }else{ fnx=FNTBL4+(k==SZI);} // ... or some other 4/8-byte length (not
float, though)
}else{ // it's a hash
fnx=((t&CMPX+FL+INT))+((n==1)?2:0)+fnx+2; // index:
CMPX/FL/n==1/intolerant (~fnx is 1 for tolerant, 0 for intolerant; fnx+2 is the
reverse)
}
----------------------------------------------------------------------
For information about J forums see http://www.jsoftware.com/forums.htm