Changeset: bb0575cfe1f2 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=bb0575cfe1f2 Modified Files: gdk/gdk.h gdk/gdk_batop.c gdk/gdk_join.c gdk/gdk_project.c Branch: Oct2020 Log Message:
Merged with Jun2020 diffs (truncated from 360 to 300 lines): diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -2023,6 +2023,7 @@ enum prop_t { GDK_MAX_VALUE, /* largest non-nil value in BAT */ GDK_HASH_BUCKETS, /* last used hash bucket size */ GDK_NUNIQUE, /* number of unique values */ + GDK_UNIQUE_ESTIMATE, /* estimate of number of distinct values */ }; /* diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c --- a/gdk/gdk_bat.c +++ b/gdk/gdk_bat.c @@ -1073,6 +1073,7 @@ BUNappend(BAT *b, const void *t, bool fo IMPSdestroy(b); /* no support for inserts in imprints yet */ OIDXdestroy(b); BATrmprop(b, GDK_NUNIQUE); + BATrmprop(b, GDK_UNIQUE_ESTIMATE); #if 0 /* enable if we have more properties than just min/max */ PROPrec *prop; do { @@ -1159,6 +1160,7 @@ BUNdelete(BAT *b, oid o) OIDXdestroy(b); HASHdestroy(b); BATrmprop(b, GDK_NUNIQUE); + BATrmprop(b, GDK_UNIQUE_ESTIMATE); #if 0 /* enable if we have more properties than just min/max */ do { for (prop = b->tprops; prop; prop = prop->next) @@ -1248,6 +1250,7 @@ BUNinplace(BAT *b, BUN p, const void *t, } } BATrmprop(b, GDK_NUNIQUE); + BATrmprop(b, GDK_UNIQUE_ESTIMATE); #if 0 /* enable if we have more properties than just min/max */ do { for (prop = b->tprops; prop; prop = prop->next) diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c --- a/gdk/gdk_batop.c +++ b/gdk/gdk_batop.c @@ -573,6 +573,7 @@ BATappend2(BAT *b, BAT *n, BAT *s, bool } } BATrmprop(b, GDK_NUNIQUE); + BATrmprop(b, GDK_UNIQUE_ESTIMATE); #if 0 /* enable if we have more properties than just min/max */ do { for (prop = b->tprops; prop; prop = prop->next) @@ -890,6 +891,7 @@ BATreplace(BAT *b, BAT *p, BAT *n, bool OIDXdestroy(b); IMPSdestroy(b); BATrmprop(b, GDK_NUNIQUE); + BATrmprop(b, GDK_UNIQUE_ESTIMATE); b->tsorted = b->trevsorted = false; b->tnosorted = b->tnorevsorted = 0; diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c --- a/gdk/gdk_join.c +++ b/gdk/gdk_join.c @@ -3041,12 +3041,19 @@ guess_uniques(BAT *b, struct canditer *c if (b->tkey) return (double) ci->ncand; - if (ci->s) { + if (ci->s == NULL || + (ci->tpe == cand_dense && ci->ncand == BATcount(b))) { + PROPrec *p = BATgetprop(b, GDK_UNIQUE_ESTIMATE); + if (p) { + TRC_DEBUG(ALGO, "b=" ALGOBATFMT " use cached value\n", + ALGOBATPAR(b)); + return p->v.val.dval; + } + s1 = BATsample(b, 1000); + } else { BAT *s2 = BATsample(ci->s, 1000); s1 = BATproject(s2, ci->s); BBPreclaim(s2); - } else { - s1 = BATsample(b, 1000); } BUN n2 = BATcount(s1); BUN n1 = n2 / 2; @@ -3056,7 +3063,12 @@ guess_uniques(BAT *b, struct canditer *c double A = (double) (cnt2 - cnt1) / (n2 - n1); double B = cnt1 - n1 * A; - return A * ci->ncand + B; + B += A * ci->ncand; + if (ci->s == NULL || + (ci->tpe == cand_dense && ci->ncand == BATcount(b))) { + BATsetprop(b, GDK_UNIQUE_ESTIMATE, TYPE_dbl, &B); + } + return B; } #define MASK_EQ 1 diff --git a/gdk/gdk_project.c b/gdk/gdk_project.c --- a/gdk/gdk_project.c +++ b/gdk/gdk_project.c @@ -290,6 +290,211 @@ project_any(BAT *restrict bn, BAT *restr return GDK_SUCCEED; } +static BAT * +project_str(BAT *restrict l, struct canditer *restrict ci, + BAT *restrict r1, BAT *restrict r2, lng t0) +{ + BAT *bn; + BUN lo, hi; + oid r1seq, r1end; + oid r2seq, r2end; + BUN h1off; + BAT *r; + BUN off; + oid seq; + var_t v; + + if ((bn = COLnew(l->hseqbase, TYPE_str, ci ? ci->ncand : BATcount(l), + TRANSIENT)) != NULL) + return NULL; + + v = (var_t) r1->tvheap->free; + if (r1->tvheap == r2->tvheap) { + h1off = 0; + BBPshare(bn->tvheap->parentid); + HEAPfree(bn->tvheap, true); + GDKfree(bn->tvheap); + bn->tvheap = r1->tvheap; + } else { + v = (v + GDK_VARALIGN - 1) & ~(GDK_VARALIGN - 1); + h1off = (BUN) v; + v += ((var_t) r2->tvheap->free + GDK_VARALIGN - 1) & ~(GDK_VARALIGN - 1); + if (HEAPextend(bn->tvheap, v, false) != GDK_SUCCEED) { + BBPreclaim(bn); + return NULL; + } + memcpy(bn->tvheap->base, r1->tvheap->base, r1->tvheap->free); +#ifndef NDEBUG + if (h1off > r1->tvheap->free) + memset(bn->tvheap->base + r1->tvheap->free, 0, h1off - r1->tvheap->free); +#endif + memcpy(bn->tvheap->base + h1off, r2->tvheap->base, r2->tvheap->free); + } + + if (v >= ((var_t) 1 << (8 * bn->twidth)) && + GDKupgradevarheap(bn, v, false, false) != GDK_SUCCEED) { + BBPreclaim(bn); + return NULL; + } + + r1seq = r1->hseqbase; + r1end = r1seq + BATcount(r1); + r2seq = r2->hseqbase; + r2end = r2seq + BATcount(r2); + if (ci) { + for (lo = 0, hi = ci->ncand; lo < hi; lo++) { + oid o = canditer_next(ci); + if (o < r1seq || o >= r2end) { + GDKerror("does not match always\n"); + return GDK_FAIL; + } + if (o < r1end) { + r = r1; + off = 0; + seq = r1seq; + } else { + r = r2; + off = h1off; + seq = r2seq; + } + switch (r->twidth) { + case 1: + v = (var_t) ((uint8_t *) r->theap.base)[o - seq] + GDK_VAROFFSET; + break; + case 2: + v = (var_t) ((uint16_t *) r->theap.base)[o - seq] + GDK_VAROFFSET; + break; + case 4: + v = (var_t) ((uint32_t *) r->theap.base)[o - seq]; + break; + case 8: + v = (var_t) ((uint64_t *) r->theap.base)[o - seq]; + break; + } + v += off; + switch (bn->twidth) { + case 1: + ((uint8_t *) bn->theap.base)[lo] = v - GDK_VAROFFSET; + break; + case 2: + ((uint16_t *) bn->theap.base)[lo] = v - GDK_VAROFFSET; + break; + case 4: + ((uint32_t *) bn->theap.base)[lo] = (uint32_t) v; + break; + case 8: + ((uint64_t *) bn->theap.base)[lo] = (uint64_t) v; + break; + } + } + } else if (BATtdense(l)) { + for (lo = 0, hi = BATcount(l); lo < hi; lo++) { + oid o = l->tseqbase + lo; + if (o < r1seq || o >= r2end) { + GDKerror("does not match always\n"); + return GDK_FAIL; + } + if (o < r1end) { + r = r1; + off = 0; + seq = r1seq; + } else { + r = r2; + off = h1off; + seq = r2seq; + } + switch (r->twidth) { + case 1: + v = (var_t) ((uint8_t *) r->theap.base)[o - seq] + GDK_VAROFFSET; + break; + case 2: + v = (var_t) ((uint16_t *) r->theap.base)[o - seq] + GDK_VAROFFSET; + break; + case 4: + v = (var_t) ((uint32_t *) r->theap.base)[o - seq]; + break; + case 8: + v = (var_t) ((uint64_t *) r->theap.base)[o - seq]; + break; + } + v += off; + switch (bn->twidth) { + case 1: + ((uint8_t *) bn->theap.base)[lo] = v - GDK_VAROFFSET; + break; + case 2: + ((uint16_t *) bn->theap.base)[lo] = v - GDK_VAROFFSET; + break; + case 4: + ((uint32_t *) bn->theap.base)[lo] = (uint32_t) v; + break; + case 8: + ((uint64_t *) bn->theap.base)[lo] = (uint64_t) v; + break; + } + } + } else { + const oid *restrict ot = (const oid *) Tloc(l, 0); + for (lo = 0, hi = BATcount(l); lo < hi; lo++) { + oid o = ot[lo]; + if (o < r1seq || o >= r2end) { + GDKerror("does not match always\n"); + return GDK_FAIL; + } + if (o < r1end) { + r = r1; + off = 0; + seq = r1seq; + } else { + r = r2; + off = h1off; + seq = r2seq; + } + switch (r->twidth) { + case 1: + v = (var_t) ((uint8_t *) r->theap.base)[o - seq] + GDK_VAROFFSET; + break; + case 2: + v = (var_t) ((uint16_t *) r->theap.base)[o - seq] + GDK_VAROFFSET; + break; + case 4: + v = (var_t) ((uint32_t *) r->theap.base)[o - seq]; + break; + case 8: + v = (var_t) ((uint64_t *) r->theap.base)[o - seq]; + break; + } + v += off; + switch (bn->twidth) { + case 1: + ((uint8_t *) bn->theap.base)[lo] = v - GDK_VAROFFSET; + break; + case 2: + ((uint16_t *) bn->theap.base)[lo] = v - GDK_VAROFFSET; + break; + case 4: + ((uint32_t *) bn->theap.base)[lo] = (uint32_t) v; + break; + case 8: + ((uint64_t *) bn->theap.base)[lo] = (uint64_t) v; + break; + } + } + } + BATsetcount(bn, lo); + bn->tsorted = bn->trevsorted = false; + bn->tnil = false; + bn->tnonil = r1->tnonil & r2->tnonil; + bn->tkey = false; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list