Changeset: f7b8a224e3ed for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f7b8a224e3ed Modified Files: gdk/gdk_join.c Branch: default Log Message:
Use hash on parent bat in join. This includes using persistent bats on parents and using already existing but not yet loaded hashes. diffs (235 lines): diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c --- a/gdk/gdk_join.c +++ b/gdk/gdk_join.c @@ -1257,6 +1257,21 @@ binsearchcand(const oid *cand, BUN lo, B nr++; \ } while (0) +#define HASHloop_bound(bi, h, hb, v, lo, hi) \ + for (hb = HASHget(h, HASHprobe((h), v)); \ + hb != HASHnil(h); \ + hb = HASHgetlink(h,hb)) \ + if (hb >= (lo) && hb < (hi) && \ + (cmp == NULL || \ + (*cmp)(v, BUNtail(bi, hb)) == 0)) + +#define HASHloop_bound_TYPE(bi, h, hb, v, lo, hi, TYPE) \ + for (hb = HASHget(h, hash_##TYPE(h, v)); \ + hb != HASHnil(h); \ + hb = HASHgetlink(h,hb)) \ + if (hb >= (lo) && hb < (hi) && \ + simple_EQ(v, BUNtloc(bi, hb), TYPE)) + static gdk_return hashjoin(BAT *r1, BAT *r2, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, int nil_on_miss, int semi, int must_match) { @@ -1266,8 +1281,9 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT * const oid *rcand = NULL, *rcandend = NULL; oid lo, ro; BATiter ri; - BUN rb, rb0; - wrd rbun2oid; + BUN rb; + BUN rl, rh; + oid rseq; BUN nr, nrcand, newcap; const char *lvals; const char *lvars; @@ -1317,7 +1333,7 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT * } /* offset to convert BUN for value in right tail column to OID * in right head column */ - rbun2oid = (wrd) r->hseqbase - (wrd) BUNfirst(r); + rseq = r->hseqbase; /* basic properties will be adjusted if necessary later on, * they were initially set by joininitresults() */ @@ -1349,6 +1365,17 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT * return GDK_SUCCEED; } + if (VIEWtparent(r)) { + BAT *b = BBPdescriptor(-VIEWtparent(r)); + rl = (BUN) ((r->T->heap.base - b->T->heap.base) >> r->T->shift) + BUNfirst(r); + r = b; + } else { + rl = BUNfirst(r); + } + rh = rl + rend; + rl += rstart; + rseq += rstart; + if (BAThash(r, 0) == GDK_FAIL) goto bailout; ri = bat_iterator(r); @@ -1369,8 +1396,8 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT * } nr = 0; if (rcand) { - HASHloop(ri, r->T->hash, rb, v) { - ro = (oid) (rb + rbun2oid); + HASHloop_bound(ri, r->T->hash, rb, v, rl, rh) { + ro = (oid) (rb - rl + rseq); if (!binsearchcand(rcand, 0, nrcand, ro)) continue; HASHLOOPBODY(); @@ -1378,11 +1405,8 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT * break; } } else { - HASHloop(ri, r->T->hash, rb, v) { - rb0 = rb - BUNfirst(r); /* zero-based */ - if (rb0 < rstart || rb0 >= rend) - continue; - ro = (oid) (rb + rbun2oid); + HASHloop_bound(ri, r->T->hash, rb, v, rl, rh) { + ro = (oid) (rb - rl + rseq); HASHLOOPBODY(); if (semi) break; @@ -1445,8 +1469,8 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT * lskipped = BATcount(r1) > 0; continue; } - HASHloop(ri, r->T->hash, rb, v) { - ro = (oid) (rb + rbun2oid); + HASHloop_bound(ri, r->T->hash, rb, v, rl, rh) { + ro = (oid) (rb - rl + rseq); if (!binsearchcand(rcand, 0, nrcand, ro)) continue; HASHLOOPBODY(); @@ -1460,11 +1484,8 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT * lskipped = BATcount(r1) > 0; continue; } - HASHloop_int(ri, r->T->hash, rb, v) { - rb0 = rb - BUNfirst(r); /* zero-based */ - if (rb0 < rstart || rb0 >= rend) - continue; - ro = (oid) (rb + rbun2oid); + HASHloop_bound_TYPE(ri, r->T->hash, rb, v, rl, rh, int) { + ro = (oid) (rb - rl + rseq); HASHLOOPBODY(); if (semi) break; @@ -1475,11 +1496,8 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT * lskipped = BATcount(r1) > 0; continue; } - HASHloop_lng(ri, r->T->hash, rb, v) { - rb0 = rb - BUNfirst(r); /* zero-based */ - if (rb0 < rstart || rb0 >= rend) - continue; - ro = (oid) (rb + rbun2oid); + HASHloop_bound_TYPE(ri, r->T->hash, rb, v, rl, rh, lng) { + ro = (oid) (rb - rl + rseq); HASHLOOPBODY(); if (semi) break; @@ -1491,11 +1509,8 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT * lskipped = BATcount(r1) > 0; continue; } - HASHloop_hge(ri, r->T->hash, rb, v) { - rb0 = rb - BUNfirst(r); /* zero-based */ - if (rb0 < rstart || rb0 >= rend) - continue; - ro = (oid) (rb + rbun2oid); + HASHloop_bound_TYPE(ri, r->T->hash, rb, v, rl, rh, hge) { + ro = (oid) (rb - rl + rseq); HASHLOOPBODY(); if (semi) break; @@ -1507,11 +1522,8 @@ hashjoin(BAT *r1, BAT *r2, BAT *l, BAT * lskipped = BATcount(r1) > 0; continue; } - HASHloop(ri, r->T->hash, rb, v) { - rb0 = rb - BUNfirst(r); /* zero-based */ - if (rb0 < rstart || rb0 >= rend) - continue; - ro = (oid) (rb + rbun2oid); + HASHloop_bound(ri, r->T->hash, rb, v, rl, rh) { + ro = (oid) (rb - rl + rseq); HASHLOOPBODY(); if (semi) break; @@ -2374,8 +2386,10 @@ gdk_return BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, BAT *r, BAT *sl, BAT *sr, int nil_matches, BUN estimate) { BAT *r1, *r2; - BUN lcount, rcount; + BUN lcount, rcount, lpcount, rpcount; BUN lsize, rsize; + int lhash, rhash; + bat lparent, rparent; int swap; size_t mem_size; @@ -2416,19 +2430,35 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, rsize = (BUN) (BATcount(r) * (Tsize(r) + (r->T->vheap ? r->T->vheap->size : 0) + 2 * sizeof(BUN))); mem_size = GDK_mem_maxsize / (GDKnr_threads ? GDKnr_threads : 1); + lparent = VIEWtparent(l); + rparent = VIEWtparent(r); + if (lparent) { + lpcount = BATcount(BBPdescriptor(lparent)); + lhash = BATcheckhash(l) || BATcheckhash(BBPdescriptor(-lparent)); + } else { + lpcount = BATcount(l); + lhash = BATcheckhash(l); + } + if (rparent) { + rpcount = BATcount(BBPdescriptor(rparent)); + rhash = BATcheckhash(r) || BATcheckhash(BBPdescriptor(-rparent)); + } else { + rpcount = BATcount(r); + rhash = BATcheckhash(r); + } if ((l->tsorted || l->trevsorted) && (r->tsorted || r->trevsorted)) { /* both sorted, smallest on left */ if (BATcount(l) <= BATcount(r)) return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0); else return mergejoin(r2, r1, r, l, sr, sl, nil_matches, 0, 0, 0); - } else if (l->T->hash && r->T->hash) { + } else if (lhash && rhash) { /* both have hash, smallest on right */ swap = lcount < rcount; - } else if (l->T->hash) { + } else if (lhash) { /* only left has hash, swap */ swap = 1; - } else if (r->T->hash) { + } else if (rhash) { /* only right has hash, don't swap */ swap = 0; } else if ((l->tsorted || l->trevsorted) && @@ -2445,7 +2475,26 @@ BATsubjoin(BAT **r1p, BAT **r2p, BAT *l, * large (i.e. prefer hash over binary search, but * only if the hash table doesn't cause thrashing) */ return mergejoin(r1, r2, l, r, sl, sr, nil_matches, 0, 0, 0); - } else if (BATcount(l) < BATcount(r)) { + } else if ((l->batPersistence == PERSISTENT || + (lparent != 0 && + BBPquickdesc(abs(lparent), 0)->batPersistence == PERSISTENT)) && + !(r->batPersistence == PERSISTENT || + (rparent != 0 && + BBPquickdesc(abs(rparent), 0)->batPersistence == PERSISTENT))) { + /* l (or its parent) is persistent and r is not, + * create hash on l since it may be reused */ + swap = 1; + } else if (!(l->batPersistence == PERSISTENT || + (lparent != 0 && + BBPquickdesc(abs(lparent), 0)->batPersistence == PERSISTENT)) && + (r->batPersistence == PERSISTENT || + (rparent != 0 && + BBPquickdesc(abs(rparent), 0)->batPersistence == PERSISTENT))) { + /* l (and its parent) is not persistent but r (or its + * parent) is, create hash on r since it may be + * reused */ + /* nothing */; + } else if (lpcount < rpcount) { /* no hashes, not sorted, create hash on smallest BAT */ swap = 1; } _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list