Changeset: e695149dd3ce for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e695149dd3ce Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message:
Add documentation and move things arround diffs (115 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -94,48 +94,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) ((TPE *) _a)[_j] = _t; \ } while(0) -static StrimpHeader * -make_header(StrimpHeader *h, uint64_t* hist, size_t hist_size) -{ - lng t0 = 0; - size_t i; - uint64_t max_counts[STRIMP_SIZE] = {0}; - const size_t cmin_max = STRIMP_SIZE - 1; - size_t hidx; - - TRC_DEBUG_IF(ALGO) t0 = GDKusec(); - - for(i = 0; i < STRIMP_SIZE; i++) - h->bytepairs[i] = 0; - - for(i = 0; i < hist_size; i++) { - if (max_counts[cmin_max] < hist[i]) { - max_counts[cmin_max] = hist[i]; - h->bytepairs[cmin_max] = i; - for(hidx = cmin_max; hidx > 0 && max_counts[hidx] > max_counts[hidx-1]; hidx--) { - swp(max_counts, hidx, hidx-1, uint64_t); - swp(h->bytepairs, hidx, hidx-1, uint16_t); - } - } - } - - for(i = 0; i < STRIMP_SIZE; i++) { - TRC_DEBUG(ALGO, "%u %u: %lu", indexToPair1(h->bytepairs[i]), indexToPair2(h->bytepairs[i]), max_counts[i]); - } - - TRC_DEBUG_ENDIF(ALGO, LLFMT "usec\n", GDKusec() - t0); - - return h; -} - - -/* static uint64_t */ -/* add_to_header(size_t idx, uint64_t count) */ -/* { */ -/* while */ -/* return GDK_SUCCEED; */ -/* } */ -/* Construct a histogram of pairs of bytes. +/* Construct a histogram of pairs of bytes in the input BAT. * * Return the histogram in hist and the number of non-zero bins in * count. @@ -194,8 +153,59 @@ GDKstrimp_make_histogram(BAT *b, uint64_ return GDK_SUCCEED; } -gdk_return -GDKstrimp_make_header(BAT *b) +/* Given a histogram find the indices of the 64 largest counts. + * + * We make one scan of histogram and every time we find a count that is + * greater than the current minimum of the 64, we bubble it up in the + * header until we find a count that is greater. We carry the index in + * the histogram because this is the information we are actually + * interested in keeping. + * + * At the end of this process we have the indices of 64 largest counts + * in the histogram. This process is O(n) in time since we are doing + * constant work (at most 63 comparisons and swaps) for each item in the + * histogram and as such is (theoretically) more efficient than sorting + * (O(nlog n))and taking the 64 largest elements. This depends on the + * size of the histogram n. For some small n sorting might be more + * efficient, but for such inputs the difference should not be + * noticeable. + * + * In the current implementation each index is a DataPair value that is + * constructed by pairToIndex from 2 consecutive bytes in the input. + */ +static StrimpHeader * +make_header(StrimpHeader *h, uint64_t* hist, size_t hist_size) +{ + lng t0 = 0; + size_t i; + uint64_t max_counts[STRIMP_HEADER_SIZE] = {0}; + const size_t cmin_max = STRIMP_HEADER_SIZE - 1; + size_t hidx; + + TRC_DEBUG_IF(ALGO) t0 = GDKusec(); + + for(i = 0; i < STRIMP_HEADER_SIZE; i++) + h->bytepairs[i] = 0; + + for(i = 0; i < hist_size; i++) { + if (max_counts[cmin_max] < hist[i]) { + max_counts[cmin_max] = hist[i]; + h->bytepairs[cmin_max] = i; + for(hidx = cmin_max; hidx > 0 && max_counts[hidx] > max_counts[hidx-1]; hidx--) { + swp(max_counts, hidx, hidx-1, uint64_t); + swp(h->bytepairs, hidx, hidx-1, DataPair); + } + } + } + + for(i = 0; i < STRIMP_HEADER_SIZE; i++) { + TRC_DEBUG(ALGO, "%u %u: %lu", indexToPair1(h->bytepairs[i]), indexToPair2(h->bytepairs[i]), max_counts[i]); + } + + TRC_DEBUG(ALGO, LLFMT " usec\n", GDKusec() - t0); + + return h; +} { uint64_t hist[STRIMP_HISTSIZE] = {0}; size_t nbins = 0; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list