Changeset: fe2c437da6ce for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=fe2c437da6ce Modified Files: gdk/gdk_join.c Branch: default Log Message:
Efficiently handle sequences of non-matching values in outer merge join. diffs (168 lines): diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c --- a/gdk/gdk_join.c +++ b/gdk/gdk_join.c @@ -1070,7 +1070,11 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT if (sl) r1->tdense = sl->tdense; while (lcand ? lcand < lcandend : lstart < lend) { - if (!nil_on_miss && !must_match && lscan > 0) { + if (lscan == 0) { + /* always search r completely */ + rcand = rcandorig; + rstart = rstartorig; + } else { /* If l is sorted (lscan > 0), we look at the * next value in r to see whether we can jump * over a large section of l using binary @@ -1081,21 +1085,21 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT * If it is, we use binary search on l, * otherwise we scan l for the next position * with a value greater than or equal to the - * value in r. Obviously, we can only do this - * if we're not inserting NILs for missing - * values in r (nil_on_miss set, i.e., outer - * join). + * value in r. * The next value to match in r is the first * if equal_order is set, the last * otherwise. */ + BUN nlx = 0; /* number of non-matching values in l */ + if (rcand) { if (rcand == rcandend) - break; - v = VALUE(r, (equal_order ? rcand[0] : rcandend[-1]) - r->hseqbase); + v = NULL; + else + v = VALUE(r, (equal_order ? rcand[0] : rcandend[-1]) - r->hseqbase); } else { - if (rstart == rend) - break; - if (rvals) { + if (rstart == rend) { + v = NULL; + } else if (rvals) { v = VALUE(r, equal_order ? rstart : rend - 1); if (roff == wrd_nil) { rval = oid_nil; @@ -1114,33 +1118,43 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT v = (const char *) &rval; } } - /* here, v points to next value in r */ - if (lcand) { + /* here, v points to next value in r, or if + * we're at the end of r, v is NULL */ + if (v == NULL) { + if (lcand) { + nlx = lcandend - lcand; + lcand = lcandend; + } else { + nlx = lend - lstart; + lstart = lend; + } + } else if (lcand) { if (lscan < (BUN) (lcandend - lcand) && lordering * cmp(VALUE(l, lcand[lscan] - l->hseqbase), v) < 0) { - lcand += binsearch(lcand, l->hseqbase, - l->ttype, lvals, lvars, - lwidth, lscan, - (BUN) (lcandend - lcand), v, - cmp, lordering, 0); + nlx = binsearch(lcand, l->hseqbase, + l->ttype, lvals, lvars, + lwidth, lscan, + (BUN) (lcandend - lcand), v, + cmp, lordering, 0); + lcand += nlx; if (lcand == lcandend) - break; - lskipped = BATcount(r1) > 0; + v = NULL; } } else if (lvals) { if (lscan < lend - lstart && lordering * cmp(VALUE(l, lstart + lscan), v) < 0) { + nlx = lstart; lstart = binsearch(NULL, 0, l->ttype, lvals, lvars, lwidth, lstart + lscan, lend, v, cmp, lordering, 0); + nlx = lstart - nlx; if (lstart == lend) - break; - lskipped = BATcount(r1) > 0; + v = NULL; } } else if (*(const oid *)v != oid_nil) { if (l->tseqbase == oid_nil) { @@ -1150,22 +1164,53 @@ mergejoin(BAT *r1, BAT *r2, BAT *l, BAT * all other values in r are * also not nil, and all * values in l are nil */ + nlx = lend - lstart; lstart = lend; - break; - } - if (*(const oid *)v > l->tseqbase) { - BUN olstart = lstart; + v = NULL; + } else if (*(const oid *)v > l->tseqbase) { + nlx = lstart; lstart = *(const oid *)v - l->tseqbase; - if (lstart >= lend) - break; - if (lstart > olstart) - lskipped = BATcount(r1) > 0; + if (lstart >= lend) { + lstart = lend; + v = NULL; + } + nlx = lstart - nlx; } } - } else if (lscan == 0) { - /* always search r completely */ - rcand = rcandorig; - rstart = rstartorig; + if (nlx > 0) { + if (must_match) { + GDKerror("mergejoin(%s,%s) does not hit always => can't use fetchjoin.\n", BATgetId(l), BATgetId(r)); + goto bailout; + } + if (nil_on_miss) { + if (r2->T->nonil) { + r2->T->nil = 1; + r2->T->nonil = 0; + r2->tdense = 0; + r2->tsorted = 0; + r2->trevsorted = 0; + } + if (lcand) { + while (nlx > 0) { + APPEND(r1, lcand[-nlx]); + APPEND(r2, oid_nil); + nlx--; + } + } else { + while (nlx > 0) { + APPEND(r1, lstart - nlx); + APPEND(r2, oid_nil); + nlx--; + } + } + } else { + lskipped = BATcount(r1) > 0; + } + } + if (v == NULL) { + /* we have exhausted the inputs */ + break; + } } /* Here we determine the next value in l that we are * going to try to match in r. We will also count the _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list