Changeset: 2e4b7358231f for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=2e4b7358231f
Modified Files:
        gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Basic correct implementation

Do not miss anything, do not allow ignored characters.


diffs (64 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -81,16 +81,25 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
  * non-ASCII codepoints that are considered spaces, for example the
  * codepoints in the range U+2000-U+200f.
  */
-#define isIgnored(x) isspace((x)) || isdigit((x))
-#define pairToIndex(b1, b2) (b1)<<8 | (b2)
+#define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x)))
+#define isNotIgnored(x) (!isIgnored(x))
+#define pairToIndex(b1, b2) (((uint8_t)b1)<<8 | ((uint8_t)b2))
 
+/* Construct a histogram of pairs of bytes.
+ *
+ * Return the histogram in hist and the number of non-zero bins in
+ * count.
+ */
 gdk_return
 GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *count)
 {
+       lng t0;
        size_t hi;
        BUN i;
        BATiter bi;
-       uint8_t *ptr, *s;
+       char *ptr, *s;
+
+       TRC_DEBUG_IF(ALGO) t0 = GDKusec();
        assert(b->ttype == TYPE_str);
 
        for(hi = 0; hi < hist_size; hi++)
@@ -99,18 +108,21 @@ GDKstrimp_makehistogram(BAT *b, uint64_t
        bi = bat_iterator(b);
        *count = 0;
        for(i = 0; i < b->batCount; i++) {
-               s = (uint8_t *)BUNtail(bi, i);
-               for(ptr = s; *(ptr + 1) != 0; ptr++) {
-                       if (isIgnored(*ptr)) /* skip the current pair and the next at the end of the loop */
-                               ptr++;
-                       else {
-                               hi = pairToIndex(*(ptr), *(ptr+1));
-                               assert(hi < hist_size);
-                               if (hist[hi] == 0)
-                                       (*count)++;
-                               hist[hi]++;
+               s = (char *)BUNtvar(bi, i);
+               if (!strNil(s)) {
+                       for(ptr = s; *ptr != 0 && *(ptr + 1) != 0; ptr++) {
+                               if (isNotIgnored(*ptr) && isNotIgnored(*(ptr+1))) {
+                                       hi = pairToIndex(*(ptr), *(ptr+1));
+                                       assert(hi < hist_size);
+                                       if (hist[hi] == 0)
+                                               (*count)++;
+                                       hist[hi]++;
+                               }
                        }
                }
        }
+
+       TRC_DEBUG(ALGO, LLFMT "usec\n", GDKusec() - t0);
+       GDKtracer_flush_buffer();
        return GDK_SUCCEED;
 }
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to