Changeset: 05018afa3a3f for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/05018afa3a3f
Modified Files:
        gdk/gdk_strimps.c
        monetdb5/modules/mal/pcre.c
Branch: strimps_update
Log Message:

First draft of parallel construction


diffs (truncated from 377 to 300 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -403,76 +403,69 @@ BATcheckstrimps(BAT *b)
                return false;
 
        assert(b->batCacheid > 0);
+
        if (b->tstrimps == (Strimps *)1) {
-               assert(!GDKinmemory(b->theap->farmid));
-               MT_lock_set(&b->batIdxLock);
-               if (b->tstrimps == (Strimps *)1) {
-                       Strimps *hp;
-                       const char *nme = BBP_physical(b->batCacheid);
-                       int fd;
+               Strimps *hp;
+               const char *nme = BBP_physical(b->batCacheid);
+               int fd;
 
-                       b->tstrimps = NULL;
-                       if ((hp = GDKzalloc(sizeof(Strimps))) != NULL &&
-                           (hp->strimps.farmid = BBPselectfarm(b->batRole, 
b->ttype, strimpheap)) >= 0) {
-                               strconcat_len(hp->strimps.filename,
-                                             sizeof(hp->strimps.filename),
-                                             nme, ".tstrimps", NULL);
+               b->tstrimps = NULL;
+               if ((hp = GDKzalloc(sizeof(Strimps))) != NULL &&
+                   (hp->strimps.farmid = BBPselectfarm(b->batRole, b->ttype, 
strimpheap)) >= 0) {
+                       strconcat_len(hp->strimps.filename,
+                                     sizeof(hp->strimps.filename),
+                                     nme, ".tstrimps", NULL);
 
-                               /* check whether a persisted strimp can be 
found */
-                               if ((fd = GDKfdlocate(hp->strimps.farmid, nme, 
"rb+", "tstrimps")) >= 0) {
-                                       struct stat st;
-                                       uint64_t desc;
-                                       size_t npairs;
-                                       size_t hsize;
-                                       /* Read the 8 byte long strimp
-                                        * descriptor.
-                                        *
-                                        * NPAIRS must be 64 in the
-                                        * current implementation.
-                                        *
-                                        * HSIZE must be between 200 and
-                                        * 584 (inclusive): 8 bytes the
-                                        * descritor, 64 bytes the pair
-                                        * sizes and n*64 bytes the
-                                        * actual pairs where 2 <= n <=
-                                        * 8.
-                                        */
-                                       if (read(fd, &desc, 8) == 8
-                                           && (desc & 0xff) == STRIMP_VERSION
-                                           && ((npairs = NPAIRS(desc)) == 
STRIMP_PAIRS)
-                                           && (hsize = HSIZE(desc)) >= 200 && 
hsize <= 584
-                                           && ((desc >> 32) & 0xff) == 1 /* 
check the persistence byte */
-                                           && fstat(fd, &st) == 0
-                                           /* TODO: We might need padding in 
the UTF-8 case. */
-                                           && st.st_size >= (off_t) 
(hp->strimps.free = hp->strimps.size =
-                                                                     /* header 
size (desc + offsets + pairs) */
-                                                                     hsize +
-                                                                     /* 
bitmasks */
-                                                                     
BATcount(b)*sizeof(uint64_t))
-                                           && HEAPload(&hp->strimps, nme, 
"tstrimps", false) == GDK_SUCCEED) {
-                                               hp->sizes_base = (uint8_t 
*)hp->strimps.base + 8; /* sizes just after the descriptor */
-                                               hp->pairs_base = hp->sizes_base 
+ STRIMP_HEADER_SIZE;   /* pairs just after the offsets. */
-                                               hp->bitstrings_base = 
hp->strimps.base + hsize;   /* bitmasks just after the pairs */
+                       /* check whether a persisted strimp can be found */
+                       if ((fd = GDKfdlocate(hp->strimps.farmid, nme, "rb+", 
"tstrimps")) >= 0) {
+                               struct stat st;
+                               uint64_t desc;
+                               size_t npairs;
+                               size_t hsize;
+                               /* Read the 8 byte long strimp
+                                * descriptor.
+                                *
+                                * HSIZE must be between 200 and
+                                * 584 (inclusive): 8 bytes the
+                                * descriptor, 64 bytes the pair
+                                * sizes and n*64 bytes the
+                                * actual pairs where 2 <= n <=
+                                * 8.
+                                */
+                               if (read(fd, &desc, 8) == 8
+                                   && (desc & 0xff) == STRIMP_VERSION
+                                   && ((npairs = NPAIRS(desc)) == STRIMP_PAIRS)
+                                   && (hsize = HSIZE(desc)) >= 200 && hsize <= 
584
+                                   && ((desc >> 32) & 0xff) == 1 /* check the 
persistence byte */
+                                   && fstat(fd, &st) == 0
+                                   /* TODO: We might need padding in the UTF-8 
case. */
+                                   && st.st_size >= (off_t) (hp->strimps.free 
= hp->strimps.size =
+                                                             /* header size 
(desc + offsets + pairs) */
+                                                             hsize +
+                                                             /* bitmasks */
+                                                             
BATcount(b)*sizeof(uint64_t))
+                                   && HEAPload(&hp->strimps, nme, "tstrimps", 
false) == GDK_SUCCEED) {
+                                       hp->sizes_base = (uint8_t 
*)hp->strimps.base + 8; /* sizes just after the descriptor */
+                                       hp->pairs_base = hp->sizes_base + 
STRIMP_HEADER_SIZE;   /* pairs just after the offsets. */
+                                       hp->bitstrings_base = hp->strimps.base 
+ hsize;   /* bitmasks just after the pairs */
 
-                                               close(fd);
-                                               ATOMIC_INIT(&hp->strimps.refs, 
1);
-                                               // STRMPincref(hp);
-                                               hp->strimps.parentid = 
b->batCacheid;
-                                               b->tstrimps = hp;
-                                               TRC_DEBUG(ACCELERATOR, 
"BATcheckstrimps(" ALGOBATFMT "): reusing persisted strimp\n", ALGOBATPAR(b));
-                                               MT_lock_unset(&b->batIdxLock);
-                                               return true;
-                                       }
                                        close(fd);
-                                       /* unlink unusable file */
-                                       GDKunlink(hp->strimps.farmid, BATDIR, 
nme, "tstrimps");
+                                       ATOMIC_INIT(&hp->strimps.refs, 1);
+                                       // STRMPincref(hp);
+                                       hp->strimps.parentid = b->batCacheid;
+                                       b->tstrimps = hp;
+                                       TRC_DEBUG(ACCELERATOR, 
"BATcheckstrimps(" ALGOBATFMT "): reusing persisted strimp\n", ALGOBATPAR(b));
+                                       MT_lock_unset(&b->batIdxLock);
+                                       return true;
+                               }
+                               close(fd);
+                               /* unlink unusable file */
+                               GDKunlink(hp->strimps.farmid, BATDIR, nme, 
"tstrimps");
 
-                               }
                        }
-                       GDKfree(hp);
-                       GDKclrerr();    /* we're not currently interested in 
errors */
                }
-               MT_lock_unset(&b->batIdxLock);
+               GDKfree(hp);
+               GDKclrerr();    /* we're not currently interested in errors */
        }
        /* The string imprint is initialized if the strimp pointer is
         * not null and the number of bitstrings is equal to the bat
@@ -572,9 +565,9 @@ STRMPfilter(BAT *b, BAT *s, const char *
 }
 
 static void
-BATstrimpsync(void *arg)
+BATstrimpsync(BAT *b)
 {
-       BAT *b = arg;
+       // BAT *b = arg;
        lng t0 = 0;
        Heap *hp;
        int fd;
@@ -582,7 +575,7 @@ BATstrimpsync(void *arg)
 
        TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
 
-       MT_lock_set(&b->batIdxLock);
+       // MT_lock_set(&b->batIdxLock);
        if ((hp = &b->tstrimps->strimps)) {
                if (HEAPsave(hp, hp->filename, NULL, true, hp->free, NULL) == 
GDK_SUCCEED) {
                        if (hp->storage == STORE_MEM) {
@@ -620,7 +613,7 @@ BATstrimpsync(void *arg)
                                  BATgetId(b), GDKusec() - t0, failed);
                }
        }
-       MT_lock_unset(&b->batIdxLock);
+       // MT_lock_unset(&b->batIdxLock);
        BBPunfix(b->batCacheid);
 }
 
@@ -631,13 +624,14 @@ persistStrimp(BAT *b)
           && b->batInserted == b->batCount
           && !b->theap->dirty
           && !GDKinmemory(b->theap->farmid)) {
-               MT_Id tid;
+               // MT_Id tid;
                BBPfix(b->batCacheid);
                char name[MT_NAME_LEN];
                snprintf(name, sizeof(name), "strimpsync%d", b->batCacheid);
-               if (MT_create_thread(&tid, BATstrimpsync, b,
-                                    MT_THR_DETACHED, name) < 0)
-                       BBPunfix(b->batCacheid);
+               BATstrimpsync(b);
+               /* if (MT_create_thread(&tid, BATstrimpsync, b, */
+               /*                   MT_THR_DETACHED, name) < 0) */
+               /* BBPunfix(b->batCacheid); */
        } else
                TRC_DEBUG(ACCELERATOR, "persistStrimp(" ALGOBATFMT "): NOT 
persisting strimp\n", ALGOBATPAR(b));
 }
@@ -709,6 +703,7 @@ bool
 BAThasstrimps(BAT *b)
 {
        BAT *pb;
+       bool ret;
        if (VIEWtparent(b)) {
                pb = BBP_cache(VIEWtparent(b));
                assert(pb);
@@ -716,18 +711,39 @@ BAThasstrimps(BAT *b)
                pb = b;
        }
 
-       return BATcheckstrimps(pb);
+       MT_lock_set(&pb->batIdxLock);
+       ret = BATcheckstrimps(pb);
+       MT_lock_unset(&pb->batIdxLock);
+
+       return ret;
 
 }
+/* This macro takes a bat and checks if the strimp construction has been
+ * completed, i.e. the strimp pointer is not null and the number of
+ * bitstrings computed equals the number of elements in the BAT. It
+ * dereferences the pointer, which must not be the marker (Strimps *)1.
+ */
+#define STRIMP_COMPLETE(b)                                             \
+       b->tstrimps != NULL &&                                          \
+               ((b->tstrimps->strimps.free - ((char 
*)b->tstrimps->bitstrings_base - b->tstrimps->strimps.base)) == 
b->batCount*sizeof(uint64_t))
+
+
 
 gdk_return
 STRMPcreate(BAT *b, BAT *s)
 {
        lng t0 = 0;
        BAT *pb;
+       Strimps *r = NULL;
+       BATiter bi;
+       BUN i;
+       oid x;
+       struct canditer ci;
+       uint64_t *dh;
 
        MT_thread_setalgorithm("create strimp index");
        TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
+       TRC_DEBUG(ACCELERATOR, "creating strimp");
        if (ATOMstorage(b->ttype) != TYPE_str) {
                GDKerror("Cannot create strimps index for non string bats\n");
                return GDK_FAIL;
@@ -740,11 +756,52 @@ STRMPcreate(BAT *b, BAT *s)
                pb = b;
        }
 
-       if (BATcheckstrimps(pb)) {
-               return GDK_SUCCEED;
+       if (pb->tstrimps == NULL || pb->tstrimps == (Strimps*)1) {
+               MT_lock_set(&pb->batIdxLock);
+               if (pb->tstrimps == NULL || pb->tstrimps == (Strimps*)1) {
+                       if (BATcheckstrimps(pb)) {
+                               MT_lock_unset(&b->batIdxLock);
+                               return GDK_SUCCEED;
+                       }
+
+                       assert(pb->tstrimps == NULL);
+
+                       if ((r = STRMPcreateStrimpHeap(pb, s)) == NULL) {
+                               MT_lock_unset(&pb->batIdxLock);
+                               return GDK_FAIL;
+                       }
+                       pb->tstrimps = r;
+               }
+               MT_lock_unset(&pb->batIdxLock);
        }
 
-       if (pb->tstrimps == NULL) {
+       r = pb->tstrimps;
+       assert(r);
+       dh = (uint64_t *)r->bitstrings_base + b->hseqbase;
+       canditer_init(&ci, b, s);
+
+       bi = bat_iterator(b);
+       for (i = 0; i < ci.ncand; i++) {
+               x = canditer_next(&ci) - b->hseqbase;
+               const char *cs = BUNtvar(bi, x);
+               if (!strNil(cs))
+                       *dh++ = STRMPmakebitstring(cs, r);
+               else
+                       *dh++ = (uint64_t)0x1 << (STRIMP_PAIRS); /* Encode NULL 
strings in the most significant bit */
+       }
+       bat_iterator_end(&bi);
+
+       MT_lock_set(&b->batIdxLock);
+       r->strimps.free += b->batCount*sizeof(uint64_t);
+       /* The thread that reaches this point last needs to write the strimp to 
disk. */
+       if ((r->strimps.free - ((char *)r->bitstrings_base - r->strimps.base)) 
== b->batCount*sizeof(uint64_t)) {
+               persistStrimp(pb);
+       }
+       MT_lock_unset(&b->batIdxLock);
+
+
+       /*
+         if (pb->tstrimps == NULL) {
                MT_lock_set(&pb->batIdxLock);
                if (pb->tstrimps == NULL) {
                        Strimps *r;
@@ -760,7 +817,6 @@ STRMPcreate(BAT *b, BAT *s)
                        }
                        dh = (uint64_t *)r->bitstrings_base;
 
-                       /* Compute bitstrings */
                        canditer_init(&ci, pb, NULL);
                        bi = bat_iterator(pb);
                        for (i = 0; i < ci.ncand; i++) {
@@ -771,7 +827,7 @@ STRMPcreate(BAT *b, BAT *s)
                                        assert((*(dh - 1) & ((uint64_t)0x1 << 
(STRIMP_PAIRS))) == 0);
                                }
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

Reply via email to