Changeset: 54c5414c518a for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/54c5414c518a Branch: default Log Message:
Merge with branch strimps-updates diffs (truncated from 680 to 300 lines): diff --git a/ChangeLog.strimps-updates b/ChangeLog.strimps-updates new file mode 100644 --- /dev/null +++ b/ChangeLog.strimps-updates @@ -0,0 +1,10 @@ +# ChangeLog file for strimpsv2 +# This file is updated with Maddlog + +* Mon Jul 17 2023 Panagiotis Koutsourakis <kutsu...@monetdbsolutions.com> +- Implemented direct masking for strimp construction. The strimps + datastructure now keeps an array of 65K 64-bit integers that is zero + everywhere except at the indexes that correspond to header pairs. The + entry for the nth pair in order has the nth bit of the bitstring + on. These can be used to quickly construct bitstrings. + diff --git a/gdk/CMakeLists.txt b/gdk/CMakeLists.txt --- a/gdk/CMakeLists.txt +++ b/gdk/CMakeLists.txt @@ -89,7 +89,7 @@ target_sources(bat gdk_analytic.h gdk_tracer.c gdk_tracer.h gdk_rtree.c - gdk_strimps.c gdk_strimps.h + gdk_strimps.c PUBLIC ${gdk_public_headers}) diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h --- a/gdk/gdk_private.h +++ b/gdk/gdk_private.h @@ -456,15 +456,20 @@ struct Imprints { BUN dictcnt; /* counter for cache dictionary */ }; +typedef uint64_t strimp_masks_t; /* TODO: make this a sparse matrix */ + struct Strimps { Heap strimps; uint8_t *sizes_base; /* pointer into strimps heap (pair sizes) */ uint8_t *pairs_base; /* pointer into strimps heap (pairs start) */ - void *bitstrings_base; /* pointer into strimps heap (bitstrings start) */ + void *bitstrings_base; /* pointer into strimps heap (bitstrings + * start) bitstrings_base is a pointer + * to uint64_t */ size_t rec_cnt; /* reconstruction counter: how many - bitstrings were added after header - construction */ - /* bitstrings_base is a pointer to uint64_t */ + * bitstrings were added after header + * construction. Currently unused. */ + strimp_masks_t *masks; /* quick access to masks for + * bitstring construction */ }; #ifdef HAVE_RTREE diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -85,7 +85,38 @@ #include "gdk.h" #include "gdk_private.h" -#include "gdk_strimps.h" +#include <stdint.h> +#include <inttypes.h> + +#define STRIMP_VERSION (uint64_t)2 +#define STRIMP_HISTSIZE (256*256) +#define STRIMP_HEADER_SIZE 64 +#define STRIMP_PAIRS (STRIMP_HEADER_SIZE - 1) +#define STRIMP_CREATION_THRESHOLD \ + ((BUN) ((ATOMIC_GET(&GDKdebug) & FORCEMITOMASK)? 100 : 5000)) + +typedef struct { +#ifdef UTF8STRIMPS + uint8_t *pbytes; +#else + uint8_t pbytes[2]; +#endif //UTF8STRIMPSX + uint8_t psize; + size_t idx; + uint64_t mask; +} CharPair; + +typedef struct { + size_t pos; + size_t lim; + const char *s; +} PairIterator; + +typedef struct { + uint64_t cnt; + uint64_t mask; + size_t idx; +} PairHistogramElem; /* Macros for accessing metadada of a strimp. These are recorded in the @@ -122,20 +153,10 @@ pair_equal(CharPair *p1, CharPair *p2) { #define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x))) #define pairToIndex(b1, b2) (size_t)(((uint16_t)b2)<<8 | ((uint16_t)b1)) -inline static bool -pair_equal(CharPair *p1, CharPair *p2) -{ - return p1->pbytes[0] == p2->pbytes[0] && - p1->pbytes[1] == p2->pbytes[1]; - -} - inline static size_t -histogram_index(PairHistogramElem *hist, size_t hsize, CharPair *p) -{ - (void) hist; - (void) hsize; - return pairToIndex(p->pbytes[0], p->pbytes[1]); +bytes2histindex(uint8_t *bytes, uint8_t psize) { + (void)psize; + return pairToIndex(bytes[0], bytes[1]); } inline static bool @@ -143,8 +164,12 @@ pair_at(PairIterator *pi, CharPair *p) { if (pi->pos >= pi->lim - 1) return false; - p->pbytes = (uint8_t*)pi->s + pi->pos; + + p->pbytes[0] = (uint8_t)tolower(*(pi->s + pi->pos)); + p->pbytes[1] = (uint8_t)tolower(*(pi->s + pi->pos + 1)); + p->psize = 2; + p->idx = pairToIndex(p->pbytes[0], p->pbytes[1]); return true; } @@ -166,35 +191,31 @@ ignored(CharPair *p, uint8_t elm) return isIgnored(p->pbytes[elm]); } -#endif // UTF8STRIMPS +static strimp_masks_t +STRMPget_mask(Strimps *r, uint64_t idx) +{ + return r->masks[idx]; +} -/* Looks up a given pair in the strimp header. Returns the index of the - * pair, or -1 if it is not found. - * - * NOTE: This routine assumes that there are no more than 128 pairs. +static void +STRMPset_mask(Strimps *r, uint64_t idx, strimp_masks_t val) +{ + r->masks[idx] = val; +} + +/* Looks up a given pair in the strimp header and returns the index as a + * 64 bit integer. The return value has a bit on in the position + * corresponding to the index of the pair in the strimp header, or is 0 + * if the pair does not occur. */ -static int8_t +static uint64_t STRMPpairLookup(Strimps *s, CharPair *p) { - size_t idx = 0; - size_t npairs = NPAIRS(((uint64_t *)s->strimps.base)[0]); - size_t offset = 0; - CharPair sp; - - // The return type implies that we have no more than 128 pairs - // in the header. - assert(npairs <= 128); + return STRMPget_mask(s, pairToIndex(p->pbytes[0], p->pbytes[1])); +} - for (idx = 0; idx < npairs; idx++) { - sp.psize = s->sizes_base[idx]; - sp.pbytes = s->pairs_base + offset; - if (pair_equal(&sp, p)) - return (int8_t)idx; - offset += sp.psize; - } - return -1; -} +#endif // UTF8STRIMPS /* Computes the bitstring of a string s with respect to the strimp r. @@ -204,7 +225,7 @@ static uint64_t STRMPmakebitstring(const char *s, Strimps *r) { uint64_t ret = 0; - int8_t pair_idx = 0; + /* int8_t pair_idx = 0; */ PairIterator pi; CharPair cp; @@ -217,9 +238,7 @@ STRMPmakebitstring(const char *s, Strimp } while(pair_at(&pi, &cp)) { - pair_idx = STRMPpairLookup(r, &cp); - if (pair_idx >= 0) - ret |= ((uint64_t)0x1 << pair_idx); + ret |= STRMPpairLookup(r, &cp); next_pair(&pi); } @@ -233,24 +252,8 @@ STRMPmakebitstring(const char *s, Strimp ((TPE *) _a)[_j] = _t; \ } while(0) -/* Finds the indices of the STRIMP_HEADER_SIZE largest counts in a given - * a histogram. It returns them in the cp pointer. - * - * We make one scan of histogram and every time we find a count that is - * greater than the current minimum of the STRIMP_HEADER_SIZE, we bubble - * it up in the header until we find a count that is greater. We carry - * the index in the histogram because this is the information we are - * actually interested in keeping. - * - * At the end of this process we have the indices of STRIMP_HEADER_SIZE - * largest counts in the histogram. This process is O(n) in time since - * we are doing constant work (at most STRIMP_HEADER_SIZE-1 comparisons - * and swaps) for each item in the histogram and as such is - * (theoretically) more efficient than sorting (O(nlog n))and taking the - * STRIMP_HEADER_SIZE largest elements. This depends on the size of the - * histogram n. For some small n sorting might be more efficient, but - * for such inputs the difference should not be noticeable. - */ + + static void STRMPchoosePairs(PairHistogramElem *hist, size_t hist_size, CharPair *cp) { @@ -275,8 +278,11 @@ STRMPchoosePairs(PairHistogramElem *hist } for(i = 0; i < STRIMP_HEADER_SIZE; i++) { - cp[i].pbytes = hist[indices[i]].p->pbytes; - cp[i].psize = hist[indices[i]].p->psize; + cp[i].pbytes[1] = (uint8_t)(hist[indices[i]].idx & 0xFF); + cp[i].pbytes[0] = (uint8_t)((hist[indices[i]].idx >> 8) & 0xFF); + cp[i].idx = hist[indices[i]].idx; + cp[i].psize = 2; + cp[i].mask = ((uint64_t)0x1) << (STRIMP_PAIRS - i - 1); } TRC_DEBUG(ACCELERATOR, LLFMT " usec\n", GDKusec() - t0); @@ -313,10 +319,16 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai } hlen = STRIMP_HISTSIZE; - if ((hist = (PairHistogramElem *)GDKzalloc(hlen*sizeof(PairHistogramElem))) == NULL) { + if ((hist = (PairHistogramElem *)GDKmalloc(hlen*sizeof(PairHistogramElem))) == NULL) { return false; } + for(hidx = 0; hidx < hlen; hidx++) { + hist[hidx].cnt = 0; + hist[hidx].mask = 0; + hist[hidx].idx = hidx; + } + // Create Histogram bi = bat_iterator(b); pip = π @@ -349,7 +361,8 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai ; } else { - hidx = histogram_index(hist, hlen, cpp); + /* hidx = histogram_index(hist, hlen, cpp); */ + hidx = cpp->idx; #ifndef UTF8STRINGS assert(hidx < hlen); #else @@ -358,20 +371,9 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai continue; } #endif - hist[hidx].cnt++; - if (hist[hidx].p == NULL) { + if (!hist[hidx].cnt) values++; - hist[hidx].p = (CharPair *)GDKmalloc(sizeof(CharPair)); - if (!hist[hidx].p) { - bat_iterator_end(&bi); - for (hidx = 0; hidx < hlen; hidx++) - GDKfree(hist[hidx].p); - GDKfree(hist); - return false; - } - hist[hidx].p->psize = cpp->psize; - hist[hidx].p->pbytes = cpp->pbytes; - } + hist[hidx].cnt++; _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org