Changeset: a7d6f1b2be59 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/a7d6f1b2be59
Branch: string_imprints
Log Message:
Merge with default branch diffs (truncated from 560 to 300 lines): diff --git a/gdk/gdk_group.c b/gdk/gdk_group.c --- a/gdk/gdk_group.c +++ b/gdk/gdk_group.c @@ -73,7 +73,7 @@ if (ngrp == maxgrps) { \ /* we need to extend extents and histo bats, */ \ /* do it at most once */ \ - maxgrps = BATcount(b); \ + maxgrps = bi.count; \ if (extents) { \ BATsetcount(en, ngrp); \ if (BATextend(en, maxgrps) != GDK_SUCCEED) \ @@ -965,15 +965,13 @@ BATgroup_internal(BAT **groups, BAT **ex /* byte-sized values, use 256 entry array to keep * track of doled out group ids; note that we can't * possibly have more than 256 groups, so the group id - * fits in an unsigned char */ - unsigned char *restrict bgrps = GDKmalloc(256); - const unsigned char *restrict w = (const unsigned char *) bi.base; - unsigned char v; + * fits in an uint8_t */ + uint8_t bgrps[256]; + const uint8_t *restrict w = (const uint8_t *) bi.base; + uint8_t v; algomsg = "byte-sized groups -- "; - if (bgrps == NULL) - goto error1; - memset(bgrps, 0xFF, 256); + memset(bgrps, 0xFF, sizeof(bgrps)); if (histo) memset(cnts, 0, maxgrps * sizeof(lng)); ngrp = 0; @@ -982,7 +980,7 @@ BATgroup_internal(BAT **groups, BAT **ex oid o = canditer_next(&ci); p = o - b->hseqbase; if ((v = bgrps[w[p]]) == 0xFF && ngrp < 256) { - bgrps[w[p]] = v = (unsigned char) ngrp++; + bgrps[w[p]] = v = (uint8_t) ngrp++; maxgrppos = r; if (extents) exts[v] = o; @@ -995,20 +993,19 @@ BATgroup_internal(BAT **groups, BAT **ex } TIMEOUT_CHECK(timeoffset, GOTO_LABEL_TIMEOUT_HANDLER(error)); - GDKfree(bgrps); } else if (g == NULL && t == TYPE_sht) { /* short-sized values, use 65536 entry array to keep * track of doled out group ids; note that we can't * possibly have more than 65536 groups, so the group - * id fits in an unsigned short */ - unsigned short *restrict sgrps = GDKmalloc(65536 * sizeof(short)); - const unsigned short *restrict w = (const unsigned short *) bi.base; - unsigned short v; + * id fits in an uint16_t */ + uint16_t *restrict sgrps = 
GDKmalloc(65536 * sizeof(short)); + const uint16_t *restrict w = (const uint16_t *) bi.base; + uint16_t v; algomsg = "short-sized groups -- "; if (sgrps == NULL) goto error1; - memset(sgrps, 0xFF, 65536 * sizeof(short)); + memset(sgrps, 0xFF, 65536 * sizeof(uint16_t)); if (histo) memset(cnts, 0, maxgrps * sizeof(lng)); ngrp = 0; @@ -1017,7 +1014,7 @@ BATgroup_internal(BAT **groups, BAT **ex oid o = canditer_next(&ci); p = o - b->hseqbase; if ((v = sgrps[w[p]]) == 0xFFFF && ngrp < 65536) { - sgrps[w[p]] = v = (unsigned short) ngrp++; + sgrps[w[p]] = v = (uint16_t) ngrp++; maxgrppos = r; if (extents) exts[v] = o; @@ -1190,7 +1187,7 @@ BATgroup_internal(BAT **groups, BAT **ex const bte *w = (bte *) bi.base; GRP_create_partial_hash_table_core( (void) 0, - (v = ((ulng)grps[r]<<8)|(unsigned char)w[p], hash_lng(hs, &v)), + (v = ((ulng)grps[r]<<8)|(uint8_t)w[p], hash_lng(hs, &v)), w[p] == w[hb] && grps[r] == grps[q], (void) 0, NOGRPTST); @@ -1207,7 +1204,7 @@ BATgroup_internal(BAT **groups, BAT **ex const sht *w = (sht *) bi.base; GRP_create_partial_hash_table_core( (void) 0, - (v = ((ulng)grps[r]<<16)|(unsigned short)w[p], hash_lng(hs, &v)), + (v = ((ulng)grps[r]<<16)|(uint16_t)w[p], hash_lng(hs, &v)), w[p] == w[hb] && grps[r] == grps[q], (void) 0, NOGRPTST); diff --git a/gdk/gdk_unique.c b/gdk/gdk_unique.c --- a/gdk/gdk_unique.c +++ b/gdk/gdk_unique.c @@ -32,17 +32,14 @@ BATunique(BAT *b, BAT *s) const char *vars; int width; oid i, o; - uint16_t *seen = NULL; const char *nme; Hash *hs = NULL; BUN hb; - BATiter bi; int (*cmp)(const void *, const void *); struct canditer ci; const char *algomsg = ""; lng t0 = 0; - size_t counter = 0; lng timeoffset = 0; QryCtx *qry_ctx = MT_thread_get_qry_ctx(); if (qry_ctx != NULL) { @@ -81,62 +78,46 @@ BATunique(BAT *b, BAT *s) assert(b->ttype != TYPE_void); + BATiter bi = bat_iterator(b); BUN initsize = BUN_NONE; if (s == NULL) { MT_rwlock_rdlock(&b->thashlock); if (b->thash != NULL && b->thash != (Hash *) 1) initsize = 
b->thash->nunique; MT_rwlock_rdunlock(&b->thashlock); - if (initsize == BUN_NONE) { - MT_lock_set(&b->theaplock); - if (b->tunique_est != 0) - initsize = (BUN) b->tunique_est; - MT_lock_unset(&b->theaplock); - } + if (initsize == BUN_NONE && bi.unique_est != 0) + initsize = (BUN) bi.unique_est; } if (initsize == BUN_NONE) initsize = 1024; bn = COLnew(0, TYPE_oid, initsize, TRANSIENT); - if (bn == NULL) + if (bn == NULL) { + bat_iterator_end(&bi); return NULL; - bi = bat_iterator(b); + } vals = bi.base; - if (b->tvarsized && b->ttype) + if (b->tvarsized && bi.type) vars = bi.vh->base; else vars = NULL; width = bi.width; - cmp = ATOMcompare(b->ttype); + cmp = ATOMcompare(bi.type); - if (BATordered(b) || BATordered_rev(b)) { - const void *prev = NULL; - algomsg = "unique: sorted"; - for (i = 0; i < cnt; i++) { - GDK_CHECK_TIMEOUT(timeoffset, counter, - GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed)); - o = canditer_next(&ci); - v = VALUE(o - b->hseqbase); - if (prev == NULL || (*cmp)(v, prev) != 0) { - if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED) - goto bunins_failed; - } - prev = v; - } - } else if (ATOMbasetype(b->ttype) == TYPE_bte) { - unsigned char val; + if (ATOMbasetype(bi.type) == TYPE_bte || + (bi.width == 1 && + ATOMstorage(bi.type) == TYPE_str && + GDK_ELIMDOUBLES(bi.vh))) { + uint8_t val; algomsg = "unique: byte-sized atoms"; - assert(vars == NULL); - seen = GDKzalloc((256 / 16) * sizeof(seen[0])); - if (seen == NULL) - goto bunins_failed; - for (i = 0; i < cnt; i++) { - GDK_CHECK_TIMEOUT(timeoffset, counter, - GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed)); + uint32_t seen[256 >> 5]; + memset(seen, 0, sizeof(seen)); + TIMEOUT_LOOP_IDX(i, cnt, timeoffset) { o = canditer_next(&ci); - val = ((const unsigned char *) vals)[o - b->hseqbase]; - if (!(seen[val >> 4] & (1U << (val & 0xF)))) { - seen[val >> 4] |= 1U << (val & 0xF); + val = ((const uint8_t *) vals)[o - b->hseqbase]; + uint32_t m = UINT32_C(1) << (val & 0x1F); + if (!(seen[val >> 5] & m)) { + seen[val >> 
5] |= m; if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED) goto bunins_failed; if (bn->batCount == 256) { @@ -146,23 +127,23 @@ BATunique(BAT *b, BAT *s) } } } - GDKfree(seen); - seen = NULL; - } else if (ATOMbasetype(b->ttype) == TYPE_sht) { - unsigned short val; + TIMEOUT_CHECK(timeoffset, + GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed)); + } else if (ATOMbasetype(bi.type) == TYPE_sht || + (bi.width == 2 && + ATOMstorage(bi.type) == TYPE_str && + GDK_ELIMDOUBLES(bi.vh))) { + uint16_t val; algomsg = "unique: short-sized atoms"; - assert(vars == NULL); - seen = GDKzalloc((65536 / 16) * sizeof(seen[0])); - if (seen == NULL) - goto bunins_failed; - for (i = 0; i < cnt; i++) { - GDK_CHECK_TIMEOUT(timeoffset, counter, - GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed)); + uint32_t seen[65536 >> 5]; + memset(seen, 0, sizeof(seen)); + TIMEOUT_LOOP_IDX(i, cnt, timeoffset) { o = canditer_next(&ci); - val = ((const unsigned short *) vals)[o - b->hseqbase]; - if (!(seen[val >> 4] & (1U << (val & 0xF)))) { - seen[val >> 4] |= 1U << (val & 0xF); + val = ((const uint16_t *) vals)[o - b->hseqbase]; + uint32_t m = UINT32_C(1) << (val & 0x1F); + if (!(seen[val >> 5] & m)) { + seen[val >> 5] |= m; if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED) goto bunins_failed; if (bn->batCount == 65536) { @@ -172,11 +153,25 @@ BATunique(BAT *b, BAT *s) } } } - GDKfree(seen); - seen = NULL; + TIMEOUT_CHECK(timeoffset, + GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed)); + } else if (BATordered(b) || BATordered_rev(b)) { + const void *prev = NULL; + algomsg = "unique: sorted"; + TIMEOUT_LOOP_IDX(i, cnt, timeoffset) { + o = canditer_next(&ci); + v = VALUE(o - b->hseqbase); + if (prev == NULL || (*cmp)(v, prev) != 0) { + if (bunfastappTYPE(oid, bn, &o) != GDK_SUCCEED) + goto bunins_failed; + } + prev = v; + } + TIMEOUT_CHECK(timeoffset, + GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed)); } else if (BATcheckhash(b) || (!b->batTransient && - cnt == BATcount(b) && + cnt == bi.count && BAThash(b) == GDK_SUCCEED)) { BUN lo = 0; 
oid seq; @@ -192,9 +187,7 @@ BATunique(BAT *b, BAT *s) MT_rwlock_rdunlock(&b->thashlock); goto lost_hash; } - for (i = 0; i < cnt; i++) { - GDK_CHECK_TIMEOUT(timeoffset, counter, - GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed)); + TIMEOUT_LOOP_IDX(i, cnt, timeoffset) { BUN p; o = canditer_next(&ci); @@ -220,6 +213,8 @@ BATunique(BAT *b, BAT *s) } } MT_rwlock_rdunlock(&b->thashlock); + TIMEOUT_CHECK(timeoffset, + GOTO_LABEL_TIMEOUT_HANDLER(bunins_failed)); } else { BUN prb; BUN p; @@ -229,10 +224,10 @@ BATunique(BAT *b, BAT *s) GDKclrerr(); /* not interested in BAThash errors */ algomsg = "unique: new partial hash"; nme = BBP_physical(b->batCacheid); - if (ATOMbasetype(b->ttype) == TYPE_bte) { + if (ATOMbasetype(bi.type) == TYPE_bte) { mask = (BUN) 1 << 8; cmp = NULL; /* no compare needed, "hash" is perfect */ - } else if (ATOMbasetype(b->ttype) == TYPE_sht) { + } else if (ATOMbasetype(bi.type) == TYPE_sht) { mask = (BUN) 1 << 16; cmp = NULL; /* no compare needed, "hash" is perfect */ } else { @@ -244,19 +239,17 @@ BATunique(BAT *b, BAT *s) _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list