Changeset: 63aecf69eb6a for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/63aecf69eb6a Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message:
Update comment diffs (173 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -12,11 +12,10 @@ * A string imprint is an index that can be used as a prefilter in LIKE * queries. It has 2 components: * - * - a header of 32 or 64 string element pairs. + * - a header of 64 string element pairs. * - * - a 32 or 64 bit mask for each string in the BAT that encodes the - * presence or absence of each element of the header in the specific - * item. + * - a 64 bit mask for each string in the BAT that encodes the presence + * or absence of each element of the header in the specific item. * * A string imprint is stored in a new Heap in the BAT, aligned in 8 * byte (64 bit) words. @@ -24,40 +23,45 @@ * The first 64 bit word, the header descriptor, describes how the * header of the strimp is encoded. The least significant byte (v in the * schematic below) is the version number. The second (np) is the number - * of pairs in the header. The next 2 bytes (hs) is the size of the - * header in bytes. Finally the fifth byte is the persistence byte. The - * last 3 bytes needed to align to the 8 byte boundary should be zero, - * and are reserved for future use. + * of pairs in the header. In the current implementation this is always + * 64. The next 2 bytes (hs) is the total size of the header in + * bytes. Finally the fifth byte is the persistence byte. The last 3 + * bytes needed to align to the 8 byte boundary should be zero, and are + * reserved for future use. * * The following np bytes are the sizes of the pairs. These can have * values from 2 to 8 and are the number of bytes that the corresponding * pair takes up. Following that there are the bytes encoding the actual * pairs. * - * | v | np | hs | p | reserved | 8bytes - * | | --- - * Strimp Header | - * | psz_0 | psz_1 | ... | | - * | | --- | - * | |np bytes | - * | ... | psz_n | --- hs bytes - * | pair_0 | pair_1 | | - * | ... | | - * | pair_k-1 | pair_k | | - * | pair_n | | - * | | --- + * | 1byte | 1byte | 1byte | 1byte | 1byte | 1byte | 1byte | 1byte | + * |---------------------------------------------------------------| + * | v | np | hs | p | reserved | 8bytes --- + * |---------------------------------------------------------------| ___ | + * | psz_0 | psz_1 | ... | | | + * | | | | + * | |np bytes | + * | | | | + * | ... | psz_n | | hs bytes + * |---------------------------------------------------------------| ___ | + * | pair_0 | pair_1 | | + * | ... | | + * | pair_k-1 | pair_k | | + * | pair_n | | + * |---------------------------------------------------------------| --- * * - * The bitmasks for each string in the BAT follow after this. + * The bitmasks for each string in the BAT follow after this, aligned to + * the string BAT. * * Strimp creation goes as follows: * * - Construct a histogram of the element (byte or character) pairs for * all the strings in the BAT. * - * - Take the 32/64 most frequent pairs as the Strimp Header. + * - Take the 64 most frequent pairs as the Strimp Header. * - * - For each string in the bat construct a 32/64 bit mask that encodes + * - For each string in the bat construct a 64 bit mask that encodes * the presence or absence of each member of the header in the string. */ @@ -80,8 +84,8 @@ #define NPAIRS(d) (((d) >> 8) & 0xff) #define HSIZE(d) (((d) >> 16) & 0xffff) -#undef UTF8STRINGS /* Not using utf8 for now */ -#ifdef UTF8STRINGS +#undef UTF8STRIMPS /* Not using utf8 for now */ +#ifdef UTF8STRIMPS static bool pair_equal(CharPair *p1, CharPair *p2) { if(p1->psize != p2->psize) @@ -94,7 +98,11 @@ pair_equal(CharPair *p1, CharPair *p2) { return true; } #else -/* BytePairs implementation */ +/* BytePairs implementation. + * + * All the of the following functions and macros up to #endif need to be + * implemented for the UTF8 case. + */ #define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x))) #define pairToIndex(b1, b2) (uint16_t)(((uint16_t)b2)<<8 | ((uint16_t)b1)) @@ -129,7 +137,7 @@ next_pair(PairIterator *pi) { return true; } -#endif // UTF8STRINGS +#endif // UTF8STRIMPS static int8_t STRMPpairLookup(Strimps *s, CharPair *p) { @@ -155,8 +163,6 @@ ignored(CharPair *p, uint8_t elm) { return isIgnored(p->pbytes[elm]); } -#define MAX_PAIR_SIZE 8 - /* Given a strimp header and a string compute the bitstring of which * digrams are present in the string. The strimp header is a map from * digram to index in the strimp. @@ -202,9 +208,6 @@ STRMPmakebitstring(const str s, Strimps * largest elements. This depends on the size of the histogram n. For * some small n sorting might be more efficient, but for such inputs the * difference should not be noticeable. - * - * In the current implementation each index is a DataPair value that is - * constructed by pairToIndex from 2 consecutive bytes in the input. */ static void STRMPchoosePairs(PairHistogramElem *hist, size_t hist_size, CharPair *cp) @@ -330,7 +333,9 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai return true; } -/* Create the heap for a string imprint. Returns NULL on failure. */ +/* Create the heap for a string imprint. Returns NULL on failure. This + * follows closely the Heap creation for the order index. + */ static Strimps * STRMPcreateStrimpHeap(BAT *b, BAT *s) { @@ -387,9 +392,15 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s) return b->tstrimps; } -#define STRIMP_COMPLETE(b) \ - b->tstrimps != NULL &&\ - (b->tstrimps->strimps.free - ((char *)b->tstrimps->strimps_base - b->tstrimps->strimps.base))/sizeof(uint64_t) == b->batCount +/* This macro takes a bat and checks if the strimp construction has been + * completed. It is completed when the strimp pointer is not null and it + * is either 1 (i.e. it exists on disk) or the number of bitstrings + * computed is the same as the number of elements in the BAT. + */ +#define STRIMP_COMPLETE(b) \ + b->tstrimps != NULL && \ + (b->tstrimps == (Strimps *)1 || \ + (b->tstrimps->strimps.free - ((char *)b->tstrimps->strimps_base - b->tstrimps->strimps.base))/sizeof(uint64_t) == b->batCount) static bool BATcheckstrimps(BAT *b) @@ -441,6 +452,7 @@ BATcheckstrimps(BAT *b) && (hsize = HSIZE(desc)) >= 200 && hsize <= 584 && ((desc >> 32) & 0xff) == 1 /* check the persistence byte */ && fstat(fd, &st) == 0 + /* TODO: We might need padding in the UTF-8 case. */ && st.st_size >= (off_t) (hp->strimps.free = hp->strimps.size = /* header size (desc + offsets + pairs) */ hsize + _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list