Changeset: bb0575cfe1f2 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=bb0575cfe1f2
Modified Files:
        gdk/gdk.h
        gdk/gdk_batop.c
        gdk/gdk_join.c
        gdk/gdk_project.c
Branch: Oct2020
Log Message:

Merged with Jun2020


diffs (truncated from 360 to 300 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -2023,6 +2023,7 @@ enum prop_t {
        GDK_MAX_VALUE,          /* largest non-nil value in BAT */
        GDK_HASH_BUCKETS,       /* last used hash bucket size */
        GDK_NUNIQUE,            /* number of unique values */
+       GDK_UNIQUE_ESTIMATE,    /* estimate of number of distinct values */
 };
 
 /*
diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c
--- a/gdk/gdk_bat.c
+++ b/gdk/gdk_bat.c
@@ -1073,6 +1073,7 @@ BUNappend(BAT *b, const void *t, bool fo
        IMPSdestroy(b); /* no support for inserts in imprints yet */
        OIDXdestroy(b);
        BATrmprop(b, GDK_NUNIQUE);
+       BATrmprop(b, GDK_UNIQUE_ESTIMATE);
 #if 0          /* enable if we have more properties than just min/max */
        PROPrec *prop;
        do {
@@ -1159,6 +1160,7 @@ BUNdelete(BAT *b, oid o)
        OIDXdestroy(b);
        HASHdestroy(b);
        BATrmprop(b, GDK_NUNIQUE);
+       BATrmprop(b, GDK_UNIQUE_ESTIMATE);
 #if 0          /* enable if we have more properties than just min/max */
        do {
                for (prop = b->tprops; prop; prop = prop->next)
@@ -1248,6 +1250,7 @@ BUNinplace(BAT *b, BUN p, const void *t,
                        }
                }
                BATrmprop(b, GDK_NUNIQUE);
+               BATrmprop(b, GDK_UNIQUE_ESTIMATE);
 #if 0          /* enable if we have more properties than just min/max */
                do {
                        for (prop = b->tprops; prop; prop = prop->next)
diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c
--- a/gdk/gdk_batop.c
+++ b/gdk/gdk_batop.c
@@ -573,6 +573,7 @@ BATappend2(BAT *b, BAT *n, BAT *s, bool 
                }
        }
        BATrmprop(b, GDK_NUNIQUE);
+       BATrmprop(b, GDK_UNIQUE_ESTIMATE);
 #if 0          /* enable if we have more properties than just min/max */
        do {
                for (prop = b->tprops; prop; prop = prop->next)
@@ -890,6 +891,7 @@ BATreplace(BAT *b, BAT *p, BAT *n, bool 
        OIDXdestroy(b);
        IMPSdestroy(b);
        BATrmprop(b, GDK_NUNIQUE);
+       BATrmprop(b, GDK_UNIQUE_ESTIMATE);
 
        b->tsorted = b->trevsorted = false;
        b->tnosorted = b->tnorevsorted = 0;
diff --git a/gdk/gdk_join.c b/gdk/gdk_join.c
--- a/gdk/gdk_join.c
+++ b/gdk/gdk_join.c
@@ -3041,12 +3041,19 @@ guess_uniques(BAT *b, struct canditer *c
        if (b->tkey)
                return (double) ci->ncand;
 
-       if (ci->s) {
+       if (ci->s == NULL ||
+           (ci->tpe == cand_dense && ci->ncand == BATcount(b))) {
+               PROPrec *p = BATgetprop(b, GDK_UNIQUE_ESTIMATE);
+               if (p) {
+                       TRC_DEBUG(ALGO, "b=" ALGOBATFMT " use cached value\n",
+                                 ALGOBATPAR(b));
+                       return p->v.val.dval;
+               }
+               s1 = BATsample(b, 1000);
+       } else {
                BAT *s2 = BATsample(ci->s, 1000);
                s1 = BATproject(s2, ci->s);
                BBPreclaim(s2);
-       } else {
-               s1 = BATsample(b, 1000);
        }
        BUN n2 = BATcount(s1);
        BUN n1 = n2 / 2;
@@ -3056,7 +3063,12 @@ guess_uniques(BAT *b, struct canditer *c
        double A = (double) (cnt2 - cnt1) / (n2 - n1);
        double B = cnt1 - n1 * A;
 
-       return A * ci->ncand + B;
+       B += A * ci->ncand;
+       if (ci->s == NULL ||
+           (ci->tpe == cand_dense && ci->ncand == BATcount(b))) {
+               BATsetprop(b, GDK_UNIQUE_ESTIMATE, TYPE_dbl, &B);
+       }
+       return B;
 }
 
 #define MASK_EQ                1
diff --git a/gdk/gdk_project.c b/gdk/gdk_project.c
--- a/gdk/gdk_project.c
+++ b/gdk/gdk_project.c
@@ -290,6 +290,211 @@ project_any(BAT *restrict bn, BAT *restr
        return GDK_SUCCEED;
 }
 
+static BAT *
+project_str(BAT *restrict l, struct canditer *restrict ci,
+           BAT *restrict r1, BAT *restrict r2, lng t0)
+{
+       BAT *bn;
+       BUN lo, hi;
+       oid r1seq, r1end;
+       oid r2seq, r2end;
+       BUN h1off;
+       BAT *r;
+       BUN off;
+       oid seq;
+       var_t v;
+
+       if ((bn = COLnew(l->hseqbase, TYPE_str, ci ? ci->ncand : BATcount(l),
+                        TRANSIENT)) != NULL)
+               return NULL;
+
+       v = (var_t) r1->tvheap->free;
+       if (r1->tvheap == r2->tvheap) {
+               h1off = 0;
+               BBPshare(bn->tvheap->parentid);
+               HEAPfree(bn->tvheap, true);
+               GDKfree(bn->tvheap);
+               bn->tvheap = r1->tvheap;
+       } else {
+               v = (v + GDK_VARALIGN - 1) & ~(GDK_VARALIGN - 1);
+               h1off = (BUN) v;
+               v += ((var_t) r2->tvheap->free + GDK_VARALIGN - 1) & ~(GDK_VARALIGN - 1);
+               if (HEAPextend(bn->tvheap, v, false) != GDK_SUCCEED) {
+                       BBPreclaim(bn);
+                       return NULL;
+               }
+               memcpy(bn->tvheap->base, r1->tvheap->base, r1->tvheap->free);
+#ifndef NDEBUG
+               if (h1off > r1->tvheap->free)
+                       memset(bn->tvheap->base + r1->tvheap->free, 0, h1off - r1->tvheap->free);
+#endif
+               memcpy(bn->tvheap->base + h1off, r2->tvheap->base, r2->tvheap->free);
+       }
+
+       if (v >= ((var_t) 1 << (8 * bn->twidth)) &&
+           GDKupgradevarheap(bn, v, false, false) != GDK_SUCCEED) {
+               BBPreclaim(bn);
+               return NULL;
+       }
+
+       r1seq = r1->hseqbase;
+       r1end = r1seq + BATcount(r1);
+       r2seq = r2->hseqbase;
+       r2end = r2seq + BATcount(r2);
+       if (ci) {
+               for (lo = 0, hi = ci->ncand; lo < hi; lo++) {
+                       oid o = canditer_next(ci);
+                       if (o < r1seq || o >= r2end) {
+                               GDKerror("does not match always\n");
+                               return GDK_FAIL;
+                       }
+                       if (o < r1end) {
+                               r = r1;
+                               off = 0;
+                               seq = r1seq;
+                       } else {
+                               r = r2;
+                               off = h1off;
+                               seq = r2seq;
+                       }
+                       switch (r->twidth) {
+                       case 1:
+                               v = (var_t) ((uint8_t *) r->theap.base)[o - seq] + GDK_VAROFFSET;
+                               break;
+                       case 2:
+                               v = (var_t) ((uint16_t *) r->theap.base)[o - seq] + GDK_VAROFFSET;
+                               break;
+                       case 4:
+                               v = (var_t) ((uint32_t *) r->theap.base)[o - seq];
+                               break;
+                       case 8:
+                               v = (var_t) ((uint64_t *) r->theap.base)[o - seq];
+                               break;
+                       }
+                       v += off;
+                       switch (bn->twidth) {
+                       case 1:
+                               ((uint8_t *) bn->theap.base)[lo] = v - GDK_VAROFFSET;
+                               break;
+                       case 2:
+                               ((uint16_t *) bn->theap.base)[lo] = v - GDK_VAROFFSET;
+                               break;
+                       case 4:
+                               ((uint32_t *) bn->theap.base)[lo] = (uint32_t) v;
+                               break;
+                       case 8:
+                               ((uint64_t *) bn->theap.base)[lo] = (uint64_t) v;
+                               break;
+                       }
+               }
+       } else if (BATtdense(l)) {
+               for (lo = 0, hi = BATcount(l); lo < hi; lo++) {
+                       oid o = l->tseqbase + lo;
+                       if (o < r1seq || o >= r2end) {
+                               GDKerror("does not match always\n");
+                               return GDK_FAIL;
+                       }
+                       if (o < r1end) {
+                               r = r1;
+                               off = 0;
+                               seq = r1seq;
+                       } else {
+                               r = r2;
+                               off = h1off;
+                               seq = r2seq;
+                       }
+                       switch (r->twidth) {
+                       case 1:
+                               v = (var_t) ((uint8_t *) r->theap.base)[o - seq] + GDK_VAROFFSET;
+                               break;
+                       case 2:
+                               v = (var_t) ((uint16_t *) r->theap.base)[o - seq] + GDK_VAROFFSET;
+                               break;
+                       case 4:
+                               v = (var_t) ((uint32_t *) r->theap.base)[o - seq];
+                               break;
+                       case 8:
+                               v = (var_t) ((uint64_t *) r->theap.base)[o - seq];
+                               break;
+                       }
+                       v += off;
+                       switch (bn->twidth) {
+                       case 1:
+                               ((uint8_t *) bn->theap.base)[lo] = v - GDK_VAROFFSET;
+                               break;
+                       case 2:
+                               ((uint16_t *) bn->theap.base)[lo] = v - GDK_VAROFFSET;
+                               break;
+                       case 4:
+                               ((uint32_t *) bn->theap.base)[lo] = (uint32_t) v;
+                               break;
+                       case 8:
+                               ((uint64_t *) bn->theap.base)[lo] = (uint64_t) v;
+                               break;
+                       }
+               }
+       } else {
+               const oid *restrict ot = (const oid *) Tloc(l, 0);
+               for (lo = 0, hi = BATcount(l); lo < hi; lo++) {
+                       oid o = ot[lo];
+                       if (o < r1seq || o >= r2end) {
+                               GDKerror("does not match always\n");
+                               return GDK_FAIL;
+                       }
+                       if (o < r1end) {
+                               r = r1;
+                               off = 0;
+                               seq = r1seq;
+                       } else {
+                               r = r2;
+                               off = h1off;
+                               seq = r2seq;
+                       }
+                       switch (r->twidth) {
+                       case 1:
+                               v = (var_t) ((uint8_t *) r->theap.base)[o - seq] + GDK_VAROFFSET;
+                               break;
+                       case 2:
+                               v = (var_t) ((uint16_t *) r->theap.base)[o - seq] + GDK_VAROFFSET;
+                               break;
+                       case 4:
+                               v = (var_t) ((uint32_t *) r->theap.base)[o - seq];
+                               break;
+                       case 8:
+                               v = (var_t) ((uint64_t *) r->theap.base)[o - seq];
+                               break;
+                       }
+                       v += off;
+                       switch (bn->twidth) {
+                       case 1:
+                               ((uint8_t *) bn->theap.base)[lo] = v - GDK_VAROFFSET;
+                               break;
+                       case 2:
+                               ((uint16_t *) bn->theap.base)[lo] = v - GDK_VAROFFSET;
+                               break;
+                       case 4:
+                               ((uint32_t *) bn->theap.base)[lo] = (uint32_t) v;
+                               break;
+                       case 8:
+                               ((uint64_t *) bn->theap.base)[lo] = (uint64_t) v;
+                               break;
+                       }
+               }
+       }
+       BATsetcount(bn, lo);
+       bn->tsorted = bn->trevsorted = false;
+       bn->tnil = false;
+       bn->tnonil = r1->tnonil & r2->tnonil;
+       bn->tkey = false;
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to