Changeset: fa263cc6a470 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/fa263cc6a470
Modified Files:
        gdk/gdk.h
        gdk/gdk_private.h
        gdk/gdk_strimps.c
        gdk/gdk_strimps.h
        monetdb5/modules/mal/batExtensions.c
Branch: string_imprints
Log Message:

Read persistent strimp [WIP]


diffs (truncated from 361 to 300 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -568,6 +568,7 @@ typedef struct {
 
 typedef struct Hash Hash;
 typedef struct Imprints Imprints;
+typedef struct Strimps Strimps;
 
 /*
  * @+ Binary Association Tables
@@ -732,7 +733,7 @@ typedef struct {
        Hash *hash;             /* hash table */
        Imprints *imprints;     /* column imprints index */
        Heap *orderidx;         /* order oid index */
-       Heap *strimps;          /* string imprint index  */
+       Strimps *strimps;       /* string imprint index  */
 
        PROPrec *props;         /* list of dynamic properties stored in the bat 
descriptor */
 } COLrec;
diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -18,6 +18,9 @@
 /* persist order index heaps for persistent BATs */
 #define PERSISTENTIDX 1
 
+/* persist strimp heaps for persistent BATs */
+#define PERSISTENTSTRIMP 1
+
 #include "gdk_system_private.h"
 
 enum heaptype {
@@ -384,6 +387,15 @@ struct Imprints {
        BUN dictcnt;            /* counter for cache dictionary               */
 };
 
+struct Strimps {
+       Heap strimps;
+       void *offsets_base;     /* pointer into strimps heap (pair offsets)  */
+       /* offsets_base is a pointer to either a uint8_t or a uint16_t */
+       uint8_t *pairs_base;    /* pointer into strimps heap (pairs start)   */
+       void *strimps_base;     /* pointer into strimps heap (strimps start) */
+       /* strimps_base is a pointer to either a uint32_t or a uint64_t */
+};
+
 typedef struct {
        MT_Lock swap;
 } batlock_t;
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -91,7 +91,6 @@ STRMP_strlen(const uint8_t *s)
 
        return ret;
 }
-#endif
 
 /* Given a BAT return the number of digrams in it. The observation is
  * that the number of digrams is the number of characters - 1:
@@ -129,6 +128,7 @@ STRMPndigrams(BAT *b, size_t *n)
 
        return GDK_SUCCEED;
 }
+#endif
 
 /* The isIgnored is a bit suspect in terms of unicode. There are
  * non-ASCII codepoints that are considered spaces, for example the
@@ -151,8 +151,8 @@ STRMPndigrams(BAT *b, size_t *n)
  * Return the histogram in hist and the number of non-zero bins in
  * count.
  */
-gdk_return
-STRMPmakehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins)
+static gdk_return
+STRMPmakehistogramBP(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins)
 {
        lng t0=0;
        size_t hi;
@@ -269,7 +269,7 @@ create_header(BAT *b)
        if ((header = (StrimpHeader*)GDKmalloc(sizeof(StrimpHeader))) == NULL)
                return NULL;
 
-       if(STRMPmakehistogram(b, hist, STRIMP_HISTSIZE, &nbins) != GDK_SUCCEED) 
{
+       if(STRMPmakehistogramBP(b, hist, STRIMP_HISTSIZE, &nbins) != 
GDK_SUCCEED) {
                GDKfree(header);
                return NULL;
        }
@@ -324,11 +324,11 @@ STRMPmakebitstring(const str s, StrimpHe
 }
 
 /* Create the heap for a string imprint. Returns NULL on failure. */
-static Heap *
-create_strimp_heap(BAT *b, StrimpHeader *h)
+static Strimps *
+create_strimp(BAT *b, StrimpHeader *h)
 {
-       Heap *r = NULL;
        uint64_t *d;
+       Strimps *r = NULL;
        uint64_t descriptor;
        uint64_t npairs, bytes_per_pair, hsize;
        size_t i;
@@ -336,15 +336,15 @@ create_strimp_heap(BAT *b, StrimpHeader 
        const char *nme;
 
        nme = GDKinmemory(b->theap->farmid) ? ":memory:" : 
BBP_physical(b->batCacheid);
-       if ((r = GDKzalloc(sizeof(Heap))) == NULL ||
-           (r->farmid = BBPselectfarm(b->batRole, b->ttype, strimpheap)) < 0 ||
-           strconcat_len(r->filename, sizeof(r->filename),
-                         nme, ".strimp", NULL) >= sizeof(r->filename) ||
-           HEAPalloc(r, BATcount(b) + STRIMP_OFFSET, sizeof(uint64_t), 0) != 
GDK_SUCCEED) {
+       if ((r = GDKzalloc(sizeof(Strimps))) == NULL ||
+           (r->strimps.farmid = BBPselectfarm(b->batRole, b->ttype, 
strimpheap)) < 0 ||
+           strconcat_len(r->strimps.filename, sizeof(r->strimps.filename),
+                         nme, ".strimp", NULL) >= sizeof(r->strimps.filename) 
||
+           HEAPalloc(&r->strimps, BATcount(b) + STRIMP_OFFSET, 
sizeof(uint64_t), 0) != GDK_SUCCEED) {
                GDKfree(r);
                return NULL;
        }
-       r->free = STRIMP_OFFSET * sizeof(uint64_t);
+       r->strimps.free = STRIMP_OFFSET * sizeof(uint64_t);
 
        npairs = STRIMP_HEADER_SIZE;
        bytes_per_pair = 2;     /* Bytepair implementation */
@@ -353,9 +353,9 @@ create_strimp_heap(BAT *b, StrimpHeader 
        assert(bytes_per_pair == 0 || npairs*bytes_per_pair == hsize);
 
        descriptor = 0;
-       descriptor =  STRIMP_VERSION | npairs << 8 | bytes_per_pair << 16 | 
hsize << 24;
+       descriptor =  STRIMP_VERSION | npairs << 8;
 
-       d = (uint64_t *)r->base;
+       d = (uint64_t *)r->strimps.base;
        *d++ = descriptor;
        /* This loop assumes that we are working with byte pairs
         * (i.e. the type of the header is uint16_t). TODO: generalize.
@@ -368,15 +368,135 @@ create_strimp_heap(BAT *b, StrimpHeader 
                }
                d++;
        }
-#ifndef NDEBUG
-       FILE *fp = fopen("/tmp/foo.strimp", "wb");
-       fwrite(r->base, sizeof(uint64_t), STRIMP_HEADER_SIZE/4 + 1, fp);
-       fclose(fp);
-#endif
 
        return r;
 }
 
+
+/* Try to load a persisted strimp for b from disk.
+ *
+ * Returns a newly allocated Strimps whose heap has been loaded and
+ * whose base pointers have been set up, or NULL when no usable file
+ * was found.  A file that can be opened but fails validation is
+ * unlinked.  The caller (BATcheckstrimps) holds b->batIdxLock.
+ *
+ * Descriptor layout as checked below (first 8 bytes of the file):
+ *   bits  0..7   STRIMP_VERSION
+ *   bits  8..15  number of pairs (32 or 64)
+ *   bits 16..31  header size in bytes (96..640)
+ *   bits 32..39  must be 1 when PERSISTENTSTRIMP is defined
+ */
+static Strimps *
+load_persisted_strimp(BAT *b)
+{
+       const char *nme = BBP_physical(b->batCacheid);
+       Strimps *hp;
+       struct stat st;
+       uint64_t desc;
+       uint64_t npairs;
+       uint64_t hsize;
+       uint8_t awidth;
+       uint8_t *base;
+       int fd;
+
+       if ((hp = GDKzalloc(sizeof(Strimps))) == NULL)
+               return NULL;
+
+       if ((hp->strimps.farmid = BBPselectfarm(b->batRole, b->ttype,
+                                               strimpheap)) < 0 ||
+           strconcat_len(hp->strimps.filename,
+                         sizeof(hp->strimps.filename),
+                         nme, ".tstrimps", NULL)
+               >= sizeof(hp->strimps.filename) ||
+           (fd = GDKfdlocate(hp->strimps.farmid, nme,
+                             "rb+", "tstrimps")) < 0) {
+               /* no farm, name too long, or no persisted file */
+               GDKfree(hp);
+               return NULL;
+       }
+
+       /* Read the 8 byte long strimp descriptor. */
+       if (read(fd, &desc, 8) != 8 || (desc & 0xff) != STRIMP_VERSION)
+               goto unusable;
+
+       /* Shift the 64 bit descriptor first and mask afterwards: a
+        * mask built as (0xffff << 16) or (0xff << 32) is computed in
+        * int, which overflows / shifts past the width of the type.
+        */
+       npairs = (desc >> 8) & 0xff;
+       hsize = (desc >> 16) & 0xffff;
+
+       /* NOTE(review): hp->strimps.size and .free are still zero here
+        * (GDKzalloc); confirm that HEAPload does not need them to be
+        * set before it is called.
+        */
+       if ((npairs != 32 && npairs != 64)
+           || hsize < 96 || hsize > 640
+           || fstat(fd, &st) != 0
+           || st.st_size < (off_t) (8 + hsize
+                                    + BATcount(b) * (npairs > 32 ? 8 : 4))
+#ifdef PERSISTENTSTRIMP
+           || ((desc >> 32) & 0xff) != 1
+#endif
+           || HEAPload(&hp->strimps, nme, "tstrimps", false) != GDK_SUCCEED)
+               goto unusable;
+
+       /* Offsets are either 1 or 2 bytes long.  This allows for
+        * pairs that are 8 bytes long (max utf-8 encoding).
+        */
+       awidth = npairs > 32 ? 2 : 1;
+
+       /* All three pointers point into the loaded heap, not into the
+        * Strimps descriptor itself.
+        */
+       base = (uint8_t *) hp->strimps.base;
+       /* offsets start just after the descriptor */
+       hp->offsets_base = base + 8;
+       /* pairs start after the offsets */
+       hp->pairs_base = base + 8 + npairs * awidth;
+       /* strimps start after the pairs */
+       hp->strimps_base = base + 8 + hsize;
+
+       close(fd);
+       hp->strimps.parentid = b->batCacheid;
+       return hp;
+
+ unusable:
+       close(fd);
+       /* unlink unusable file (same extension as the one opened) */
+       GDKunlink(hp->strimps.farmid, BATDIR, nme, "tstrimps");
+       GDKfree(hp);
+       return NULL;
+}
+
+/* Check whether b has a usable string imprint, loading a persisted
+ * one from disk if necessary.
+ *
+ * When b->tstrimps holds the marker value (Strimps *)1 the function
+ * takes b->batIdxLock and tries to load the strimp from disk; on
+ * failure b->tstrimps is reset to NULL.
+ *
+ * Returns true if b->tstrimps points to a strimp on return, false
+ * otherwise.
+ */
+static bool
+BATcheckstrimps(BAT *b)
+{
+       bool ret;
+       lng t = GDKusec();
+
+       if (b->tstrimps == (Strimps *)1) {
+               assert(!GDKinmemory(b->theap->farmid));
+               MT_lock_set(&b->batIdxLock);
+               /* re-test under the lock: the value may have been
+                * changed while we were waiting for it */
+               if (b->tstrimps == (Strimps *)1) {
+                       b->tstrimps = load_persisted_strimp(b);
+                       if (b->tstrimps != NULL) {
+                               TRC_DEBUG(ACCELERATOR,
+                                         "BATcheckstrimps(" ALGOBATFMT "): "
+                                         "reusing persisted strimp\n",
+                                         ALGOBATPAR(b));
+                               MT_lock_unset(&b->batIdxLock);
+                               return true;
+                       }
+                       /* we're not currently interested in errors */
+                       GDKclrerr();
+               }
+               MT_lock_unset(&b->batIdxLock);
+       }
+       ret = b->tstrimps != NULL;
+       if (ret)
+               TRC_DEBUG(ACCELERATOR,
+                         "BATcheckstrimps(" ALGOBATFMT "): "
+                         "already has strimps, waited " LLFMT " usec\n",
+                         ALGOBATPAR(b), GDKusec() - t);
+
+       /* report what we found instead of always answering "no" */
+       return ret;
+}
+
+/* Filter a BAT b using a string q. Return the result as a candidate
+ * list.
+ *
+ * For every row of b the stored 64 bit strimp is compared with the
+ * bitstring computed for q; rows whose strimp contains all bits of
+ * the query bitstring are appended to the result.
+ *
+ * Returns a new BAT (passed through virtualize()) on success, or
+ * NULL when b has no strimp, when allocation fails, or when
+ * appending to the result fails.
+ */
+BAT *
+STRMPfilter(BAT *b, char *q)
+{
+       BAT *r = NULL;
+       BUN i;
+       StrimpHeader *hd = NULL;
+       uint64_t qbmask;
+       const uint64_t *ptr;
+
+       if (b->tstrimps == NULL || !BATcheckstrimps(b))
+               goto sfilter_fail;
+
+       r = COLnew(0, TYPE_oid, b->batCount, TRANSIENT);
+       if (r == NULL)
+               goto sfilter_fail;
+
+       /* NOTE(review): hd is still NULL here; nothing in this
+        * function reads the header back from the strimp heap.
+        * Confirm what STRMPmakebitstring expects before relying on
+        * the result.
+        */
+       qbmask = STRMPmakebitstring(q, hd);
+
+       /* STRIMP_OFFSET counts uint64_t words (see create_strimp), so
+        * it must not be multiplied by sizeof(uint64_t) again when it
+        * is added to a uint64_t pointer.
+        */
+       ptr = (const uint64_t *)b->tstrimps->strimps.base + STRIMP_OFFSET;
+
+       for (i = 0; i < b->batCount; i++) {
+               /* one strimp word per row: index it with i */
+               if ((ptr[i] & qbmask) == qbmask) {
+                       /* NOTE(review): the position is appended
+                        * without adding b->hseqbase; confirm this
+                        * is what callers expect.
+                        */
+                       oid pos = i;
+                       if (BUNappend(r, &pos, false) != GDK_SUCCEED)
+                               goto sfilter_fail;
+               }
+       }
+
+       return virtualize(r);
+
+ sfilter_fail:
+       /* r is still NULL when we fail before or during COLnew */
+       if (r != NULL)
+               BBPunfix(r->batCacheid);
+       /* headers are allocated with GDKmalloc (see create_header) */
+       GDKfree(hd);
+       return NULL;
+}
+
 /* Create */
 gdk_return
 STRMPcreate(BAT *b)
@@ -386,22 +506,22 @@ STRMPcreate(BAT *b)
        BUN i;
        str s;
        StrimpHeader *head;
-       Heap *h;
+       Strimps *h;
        uint64_t *dh;
 
        assert(b->ttype == TYPE_str);
-       TRC_DEBUG_IF(ALGO) t0 = GDKusec();
+       TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
 
        if (b->tstrimps == NULL) {
                if ((head = create_header(b)) == NULL) {
                        return GDK_FAIL;
                }
 
-               if ((h = create_strimp_heap(b, head)) == NULL) {
+               if ((h = create_strimp(b, head)) == NULL) {
                        GDKfree(head);
                        return GDK_FAIL;
                }
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to