Changeset: 4f3cbb1ef6c7 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4f3cbb1ef6c7
Modified Files:
        gdk/gdk_strimps.c
        gdk/gdk_strimps.h
Branch: string_imprints
Log Message:

Byte pair histogram construction

Count the occurrences of pairs of bytes. This is different than
counting pairs of characters, unless the characters are ASCII.


diffs (71 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -76,3 +76,72 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
 
        return GDK_SUCCEED;
 }
+
+/* Bytes for which isIgnored() is true never start a counted pair:
+ * whitespace and decimal digits as classified by <ctype.h>.
+ *
+ * The isIgnored is a bit suspect in terms of unicode. There are
+ * non-ASCII codepoints that are considered spaces, for example the
+ * codepoints in the range U+2000-U+200f.
+ *
+ * Both macros are fully parenthesized so that they can be used safely
+ * inside larger expressions (e.g. !isIgnored(c)).
+ */
+#define isIgnored(x) (isspace((x)) || isdigit((x)))
+/* Map a pair of bytes to its slot in a 256*256 entry histogram. */
+#define pairToIndex(b1, b2) (((size_t) (b1) << 8) | (size_t) (b2))
+
+/* Build a histogram of byte pairs over all strings in b.
+ *
+ * A pair is skipped when its first byte is ignored (see isIgnored), and
+ * so is the pair right after it; an ignored second byte alone does not
+ * cause a skip.
+ *
+ * b         BAT with a string tail (asserted).
+ * hist      array of hist_size counters; it is zeroed before counting.
+ *           Slot pairToIndex(b1, b2) receives the number of counted
+ *           occurrences of byte b1 immediately followed by byte b2.
+ * hist_size number of entries in hist; every computed index is asserted
+ *           to be below it, so 256*256 entries cover all byte pairs.
+ * count     set to the number of distinct pairs counted.  Both bytes of
+ *           a counted pair are non-zero, so there are at most 255*255
+ *           distinct pairs and the uint16_t cannot wrap.
+ *
+ * Always returns GDK_SUCCEED.
+ */
+gdk_return
+GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, uint16_t *count)
+{
+       size_t hi;
+       BUN i;
+       BATiter bi;
+       const uint8_t *ptr, *s;
+
+       assert(b->ttype == TYPE_str);
+
+       for (hi = 0; hi < hist_size; hi++)
+               hist[hi] = 0;
+
+       bi = bat_iterator(b);
+       *count = 0;
+       for (i = 0; i < b->batCount; i++) {
+               s = (const uint8_t *) BUNtail(bi, i);
+               /* Test ptr[0] before ptr[1]: for an empty string, or after
+                * the two-byte skip below, ptr may already point at the
+                * terminating NUL and ptr[1] would lie past the string. */
+               for (ptr = s; ptr[0] != 0 && ptr[1] != 0; ptr++) {
+                       if (isIgnored(*ptr)) {
+                               /* an ignored first byte drops this pair and,
+                                * via the loop increment, the next one too */
+                               ptr++;
+                               continue;
+                       }
+                       hi = pairToIndex(ptr[0], ptr[1]);
+                       assert(hi < hist_size);
+                       if (hist[hi] == 0)
+                               (*count)++;
+                       hist[hi]++;
+               }
+       }
+       return GDK_SUCCEED;
+}
diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h
--- a/gdk/gdk_strimps.h
+++ b/gdk/gdk_strimps.h
@@ -11,17 +11,18 @@
 
 #include <stdint.h>
 
-#define HISTSIZE 64
-
-typedef struct {
-       uint64_t counts[HISTSIZE];
-       char foo;
-} Histogram;
-
-typedef struct {
-       Histogram* hist;
-} Strimp;
+/* Count the occurrences of pairs of bytes. This is a compromise between
+ * just handling ASCII and full UTF-8 support.
+ *
+ * STRIMP_HISTSIZE is the number of distinct byte pairs (256 * 256). The
+ * expansion is parenthesized so it stays one value in any expression.
+ */
+#define STRIMP_HISTSIZE (256*256)
 
 gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n);
+/* Fill hist (hist_size entries) with byte pair occurrence counts for the
+ * strings in b and store the number of distinct pairs seen in *count.
+ */
+gdk_export gdk_return GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, uint16_t *count);
 
 #endif /* _GDK_STRIMPS_H_ */
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to