Changeset: 57ba6f8b90aa for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=57ba6f8b90aa
Modified Files:
        gdk/gdk_strimps.c
        gdk/gdk_strimps.h
        monetdb5/modules/mal/batExtensions.c
Branch: string_imprints
Log Message:

First implementation of strimp header contruction


diffs (212 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -84,7 +84,56 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
 #define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x)))
 #define isNotIgnored(x) (!isIgnored(x))
 #define pairToIndex(b1, b2) (((uint8_t)b1)<<8 | ((uint8_t)b2))
+#define indexToPair1(idx) (idx & 0xff00) >> 8
+#define indexToPair2(idx) (idx & 0xff)
+#define swp(_a, _i, _j, TPE)                   \
+       do {                                    \
+               TPE _t = ((TPE *)_a)[_i];       \
+               ((TPE *) _a)[_i] = ((TPE *) _a)[_j];    \
+               ((TPE *) _a)[_j] = _t;                  \
+       } while(0)
 
+static StrimpHeader *
+make_header(StrimpHeader *h, uint64_t* hist, size_t hist_size)
+{
+       lng t0 = 0;
+       size_t i;
+       uint64_t max_counts[STRIMP_SIZE] = {0};
+       const size_t cmin_max = STRIMP_SIZE - 1;
+       size_t hidx;
+
+       TRC_DEBUG_IF(ALGO) t0 = GDKusec();
+
+       for(i = 0; i < STRIMP_SIZE; i++)
+               h->bytepairs[i] = 0;
+
+       for(i = 0; i < hist_size; i++) {
+               if (max_counts[cmin_max] < hist[i]) {
+                       max_counts[cmin_max] = hist[i];
+                       h->bytepairs[cmin_max] = i;
+                        for(hidx = cmin_max; hidx > 0 && max_counts[hidx] > 
max_counts[hidx-1]; hidx--) {
+                               swp(max_counts, hidx, hidx-1, uint64_t);
+                               swp(h->bytepairs, hidx, hidx-1, uint16_t);
+                       }
+               }
+       }
+
+       for(i = 0; i < STRIMP_SIZE; i++) {
+               TRC_DEBUG(ALGO, "%u %u: %lu", indexToPair1(h->bytepairs[i]), 
indexToPair2(h->bytepairs[i]), max_counts[i]);
+       }
+
+       TRC_DEBUG_ENDIF(ALGO, LLFMT "usec\n", GDKusec() - t0);
+
+       return h;
+}
+
+
+/* static uint64_t */
+/* add_to_header(size_t idx, uint64_t count) */
+/* { */
+/*     while */
+/*     return GDK_SUCCEED; */
+/* } */
 /* Construct a histogram of pairs of bytes.
  *
  * Return the histogram in hist and the number of non-zero bins in
@@ -98,6 +147,7 @@ GDKstrimp_make_histogram(BAT *b, uint64_
        BUN i;
        BATiter bi;
        char *ptr, *s;
+       /* uint64_t cur_min = 0; */
 
        TRC_DEBUG_IF(ALGO) t0 = GDKusec();
        assert(b->ttype == TYPE_str);
@@ -131,12 +181,61 @@ GDKstrimp_make_histogram(BAT *b, uint64_
                                        if (hist[hi] == 0)
                                                (*nbins)++;
                                        hist[hi]++;
+                                       /* if (hist[hi] > cur_min) */
+                                       /*      cur_min = add_to_header(hi, 
hist[hi]); */
                                }
                        }
                }
        }
 
-       TRC_DEBUG(ALGO, LLFMT "usec\n", GDKusec() - t0);
+       TRC_DEBUG_ENDIF(ALGO, LLFMT "usec\n", GDKusec() - t0);
        GDKtracer_flush_buffer();
        return GDK_SUCCEED;
 }
+
+gdk_return
+GDKstrimp_make_header(BAT *b)
+{
+       uint64_t hist[STRIMP_HISTSIZE] = {0};
+       size_t nbins = 0;
+       StrimpHeader header;
+       if(GDKstrimp_make_histogram(b, hist, STRIMP_HISTSIZE, &nbins) != 
GDK_SUCCEED) {
+               return GDK_FAIL;
+       }
+
+       make_header(&header, hist, STRIMP_HISTSIZE);
+
+       return GDK_SUCCEED;
+}
+
+
+/* static uint8_t */
+/* lookup_index(StrimpHeader *h, uint16_t n) */
+/* { */
+/*     size_t i; */
+/*     for(i = 0; i < STRIMP_SIZE; i++) */
+/*             if(h->bytepairs[i] == n) */
+/*                     return i; */
+
+/*     return 0; */
+/* } */
+
+
+/* Given a strimp header and a string compute the bitstring of which
+ * digrams(byte pairs) are present in the string. The strimp header is a
+ * map from digram(byte pair) to index in the strimp.
+ */
+/* static uint64_t */
+/* GDKstrimp_make_bitstring(str s, StrimpHeader *h) */
+/* { */
+/*     uint64_t ret = 0; */
+/*     uint8_t pair_idx; */
+/*     char *it; */
+
+/*     for(it = s; *it != 0 && *(it+1) != 0; it++) { */
+/*             pair_idx = lookup_index(h, pairToIndex(*it, *(it+1))); */
+/*             ret |= 0x1 << pair_idx; */
+/*     } */
+
+/*     return ret; */
+/* } */
diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h
--- a/gdk/gdk_strimps.h
+++ b/gdk/gdk_strimps.h
@@ -15,8 +15,15 @@
  * just handling ASCII and full UTF-8 support.
  */
 #define STRIMP_HISTSIZE 256*256
-
-gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n);
+#define STRIMP_SIZE 64
 
-gdk_export gdk_return GDKstrimp_make_histogram(BAT *b, uint64_t *hist, size_t 
hist_size, size_t *nbins);
+typedef struct {
+       // TODO: find a better name for this
+       uint16_t bytepairs[STRIMP_SIZE];
+} StrimpHeader;
+
+gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n); // Remove?
+gdk_export gdk_return GDKstrimp_make_histogram(BAT *b, uint64_t *hist, size_t 
hist_size, size_t *nbins); // make static
+// gdk_export gdk_return GDKstrimp_make_header(StrimpHeader *h, uint64_t 
*hist, size_t hist_size); // make static
+gdk_export gdk_return GDKstrimp_make_header(BAT *b);
 #endif /* _GDK_STRIMPS_H_ */
diff --git a/monetdb5/modules/mal/batExtensions.c 
b/monetdb5/modules/mal/batExtensions.c
--- a/monetdb5/modules/mal/batExtensions.c
+++ b/monetdb5/modules/mal/batExtensions.c
@@ -341,7 +341,7 @@ CMDBATappend_bulk(Client cntxt, MalBlkPt
  * String imprints.
  */
 static str
-CMDstrimp_ndigrams(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+PATstrimp_ndigrams(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
 {
        bat bid;
        BAT *b;
@@ -365,7 +365,7 @@ CMDstrimp_ndigrams(Client cntxt, MalBlkP
 }
 
 static str
-CMDstrimp_makehist(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+PATstrimp_makehist(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
 {
        bat bid;
        BAT *b, *ob;
@@ -401,6 +401,24 @@ CMDstrimp_makehist(Client cntxt, MalBlkP
        return MAL_SUCCEED;
 }
 
+static str
+PATstrimp_makeheader(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
+       bat bid;
+       BAT *b;
+       (void)cntxt;
+       (void)mb;
+
+       bid = *getArgReference_bat(stk, pci, 2);
+       if ((b = BATdescriptor(bid)) == NULL)
+               throw(MAL, "bat.strimpHeader", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
+
+       if(GDKstrimp_make_header(b) != GDK_SUCCEED)
+               throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) 
OPERATION_FAILED);
+
+       return MAL_SUCCEED;
+}
+
 
 #include "mel.h"
 mel_func batExtensions_init_funcs[] = {
@@ -432,9 +450,9 @@ mel_func batExtensions_init_funcs[] = {
  pattern("bat", "appendBulk", CMDBATappend_bulk, false, "append the arguments 
ins to i", args(1,4, batargany("",1), 
batargany("i",1),arg("force",bit),batvarargany("ins",1))),
 
  /* String imprints */
- pattern("bat", "strimpNDigrams", CMDstrimp_ndigrams, false, "count digrams in 
a string bat", args(1,2,arg("",lng),batarg("b",str))),
- pattern("bat", "strimpHistogram", CMDstrimp_makehist, false, "make a 
histogram of all the byte pairs in a BAT", args(2,3,arg("",lng), 
batarg("",lng),batarg("b",str))),
- //pattern("batcalc", "make_histogam", CMDstrimp_makehist, false, "make a 
histogram of all the byte pairs in a BAT", args(2, 3, arg("", sht), batarg("", 
lng), batarg("b", str))),
+ pattern("bat", "strimpNDigrams", PATstrimp_ndigrams, false, "count digrams in 
a string bat", args(1,2,arg("",lng),batarg("b",str))),
+ pattern("bat", "strimpHistogram", PATstrimp_makehist, false, "make a 
histogram of all the byte pairs in a BAT", args(2,3,arg("",lng), 
batarg("",lng),batarg("b",str))),
+ pattern("bat", "strimpHeader", PATstrimp_makeheader, false, "construct the 
strimp header from a BAT", args(1,2,arg("",void),batarg("b",str))),
  { .imp=NULL }
 };
 #include "mal_import.h"
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to