Changeset: 2e4b7358231f for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=2e4b7358231f
Modified Files:
	gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Basic correct implementation

Do not miss anything, do not allow ignored characters.


diffs (64 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -81,16 +81,25 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
  * non-ASCII codepoints that are considered spaces, for example the
  * codepoints in the range U+2000-U+200f.
  */
-#define isIgnored(x) isspace((x)) || isdigit((x))
-#define pairToIndex(b1, b2) (b1)<<8 | (b2)
+#define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x)))
+#define isNotIgnored(x) (!isIgnored(x))
+#define pairToIndex(b1, b2) (((uint8_t)b1)<<8 | ((uint8_t)b2))
 
+/* Construct a histogram of pairs of bytes.
+ *
+ * Return the histogram in hist and the number of non-zero bins in
+ * count.
+ */
 gdk_return
 GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *count)
 {
+	lng t0;
 	size_t hi;
 	BUN i;
 	BATiter bi;
-	uint8_t *ptr, *s;
+	char *ptr, *s;
+
+	TRC_DEBUG_IF(ALGO) t0 = GDKusec();
 
 	assert(b->ttype == TYPE_str);
 	for(hi = 0; hi < hist_size; hi++)
@@ -99,18 +108,21 @@ GDKstrimp_makehistogram(BAT *b, uint64_t
 	bi = bat_iterator(b);
 	*count = 0;
 	for(i = 0; i < b->batCount; i++) {
-		s = (uint8_t *)BUNtail(bi, i);
-		for(ptr = s; *(ptr + 1) != 0; ptr++) {
-			if (isIgnored(*ptr)) /* skip the current pair and the next at the end of the loop */
-				ptr++;
-			else {
-				hi = pairToIndex(*(ptr), *(ptr+1));
-				assert(hi < hist_size);
-				if (hist[hi] == 0)
-					(*count)++;
-				hist[hi]++;
+		s = (char *)BUNtvar(bi, i);
+		if (!strNil(s)) {
+			for(ptr = s; *ptr != 0 && *(ptr + 1) != 0; ptr++) {
+				if (isNotIgnored(*ptr) && isNotIgnored(*(ptr+1))) {
+					hi = pairToIndex(*(ptr), *(ptr+1));
+					assert(hi < hist_size);
+					if (hist[hi] == 0)
+						(*count)++;
+					hist[hi]++;
+				}
 			}
 		}
 	}
+
+	TRC_DEBUG(ALGO, LLFMT "usec\n", GDKusec() - t0);
+	GDKtracer_flush_buffer();
 	return GDK_SUCCEED;
 }
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list