Changeset: 4f3cbb1ef6c7 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4f3cbb1ef6c7
Modified Files:
	gdk/gdk_strimps.c
	gdk/gdk_strimps.h
Branch: string_imprints
Log Message:

Byte pair histogram construction

Count the occurrences of pairs of bytes. This is different than counting
pairs of characters, unless the characters are ASCII.


diffs (71 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -76,3 +76,41 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
 
 	return GDK_SUCCEED;
 }
+
+/* The isIgnored is a bit suspect in terms of unicode. There are
+ * non-ASCII codepoints that are considered spaces, for example the
+ * codepoints in the range U+2000-U+200f.
+ */
+#define isIgnored(x) isspace((x)) || isdigit((x))
+#define pairToIndex(b1, b2) (b1)<<8 | (b2)
+
+gdk_return
+GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, uint16_t *count)
+{
+	size_t hi;
+	BUN i;
+	BATiter bi;
+	uint8_t *ptr, *s;
+	assert(b->ttype == TYPE_str);
+
+	for(hi = 0; hi < hist_size; hi++)
+		hist[hi] = 0;
+
+	bi = bat_iterator(b);
+	*count = 0;
+	for(i = 0; i < b->batCount; i++) {
+		s = (uint8_t *)BUNtail(bi, i);
+		for(ptr = s; *(ptr + 1) != 0; ptr++) {
+			if (isIgnored(*ptr)) /* skip the current pair and the next at the end of the loop */
+				ptr++;
+			else {
+				hi = pairToIndex(*(ptr), *(ptr+1));
+				assert(hi < hist_size);
+				if (hist[hi] == 0)
+					(*count)++;
+				hist[hi]++;
+			}
+		}
+	}
+	return GDK_SUCCEED;
+}
diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h
--- a/gdk/gdk_strimps.h
+++ b/gdk/gdk_strimps.h
@@ -11,17 +11,12 @@
 
 #include <stdint.h>
 
-#define HISTSIZE 64
-
-typedef struct {
-	uint64_t counts[HISTSIZE];
-	char foo;
-} Histogram;
-
-typedef struct {
-	Histogram* hist;
-} Strimp;
+/* Count the occurences of pairs of bytes. This is a compromise between
+ * just handling ASCII and full UTF-8 support.
+ */
+#define STRIMP_HISTSIZE 256*256
 
 gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n);
+gdk_export gdk_return GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, uint16_t *count);
 
 #endif /* _GDK_STRIMPS_H_ */
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list