Changeset: 54c5414c518a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/54c5414c518a
Branch: default
Log Message:

Merge with branch strimps-updates


diffs (truncated from 680 to 300 lines):

diff --git a/ChangeLog.strimps-updates b/ChangeLog.strimps-updates
new file mode 100644
--- /dev/null
+++ b/ChangeLog.strimps-updates
@@ -0,0 +1,10 @@
+# ChangeLog file for strimpsv2
+# This file is updated with Maddlog
+
+* Mon Jul 17 2023 Panagiotis Koutsourakis <kutsu...@monetdbsolutions.com>
+- Implemented direct masking for strimp construction. The strimps
+  datastructure now keeps an array of 65K 64-bit integers that is zero
+  everywhere except at the indexes that correspond to header pairs. The
+  entry for the nth pair in order has the nth bit of the bitstring
+  on. These can be used to quickly construct bitstrings.
+
diff --git a/gdk/CMakeLists.txt b/gdk/CMakeLists.txt
--- a/gdk/CMakeLists.txt
+++ b/gdk/CMakeLists.txt
@@ -89,7 +89,7 @@ target_sources(bat
   gdk_analytic.h
   gdk_tracer.c gdk_tracer.h
   gdk_rtree.c
-  gdk_strimps.c gdk_strimps.h
+  gdk_strimps.c
   PUBLIC
   ${gdk_public_headers})
 
diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -456,15 +456,20 @@ struct Imprints {
        BUN dictcnt;            /* counter for cache dictionary               */
 };
 
+typedef uint64_t strimp_masks_t;  /* TODO: make this a sparse matrix */
+
 struct Strimps {
        Heap strimps;
        uint8_t *sizes_base;    /* pointer into strimps heap (pair sizes)  */
        uint8_t *pairs_base;    /* pointer into strimps heap (pairs start)   */
-       void *bitstrings_base;  /* pointer into strimps heap (bitstrings start) 
*/
+       void *bitstrings_base;  /* pointer into strimps heap (bitstrings
+                                * start) bitstrings_base is a pointer
+                                * to uint64_t */
        size_t rec_cnt;         /* reconstruction counter: how many
-                                  bitstrings were added after header
-                                  construction */
-       /* bitstrings_base is a pointer to uint64_t */
+                                * bitstrings were added after header
+                                * construction. Currently unused. */
+       strimp_masks_t *masks;  /* quick access to masks for
+                                * bitstring construction */
 };
 
 #ifdef HAVE_RTREE
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -85,7 +85,38 @@
 #include "gdk.h"
 #include "gdk_private.h"
 
-#include "gdk_strimps.h"
+#include <stdint.h>
+#include <inttypes.h>
+
+#define STRIMP_VERSION (uint64_t)2
+#define STRIMP_HISTSIZE (256*256)
+#define STRIMP_HEADER_SIZE 64
+#define STRIMP_PAIRS (STRIMP_HEADER_SIZE - 1)
+#define STRIMP_CREATION_THRESHOLD                              \
+       ((BUN) ((ATOMIC_GET(&GDKdebug) & FORCEMITOMASK)? 100 : 5000))
+
+typedef struct {
+#ifdef UTF8STRIMPS
+       uint8_t *pbytes;
+#else
+       uint8_t pbytes[2];
+#endif //UTF8STRIMPSX
+       uint8_t psize;
+       size_t idx;
+       uint64_t mask;
+} CharPair;
+
+typedef struct {
+       size_t pos;
+       size_t lim;
+       const char *s;
+} PairIterator;
+
+typedef struct {
+       uint64_t cnt;
+       uint64_t mask;
+       size_t idx;
+} PairHistogramElem;
 
 
 /* Macros for accessing metadada of a strimp. These are recorded in the
@@ -122,20 +153,10 @@ pair_equal(CharPair *p1, CharPair *p2) {
 #define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x)))
 #define pairToIndex(b1, b2) (size_t)(((uint16_t)b2)<<8 | ((uint16_t)b1))
 
-inline static bool
-pair_equal(CharPair *p1, CharPair *p2)
-{
-       return p1->pbytes[0] == p2->pbytes[0] &&
-               p1->pbytes[1] == p2->pbytes[1];
-
-}
-
 inline static size_t
-histogram_index(PairHistogramElem *hist, size_t hsize, CharPair *p)
-{
-       (void) hist;
-       (void) hsize;
-       return pairToIndex(p->pbytes[0], p->pbytes[1]);
+bytes2histindex(uint8_t *bytes, uint8_t psize) {
+       (void)psize;
+       return pairToIndex(bytes[0], bytes[1]);
 }
 
 inline static bool
@@ -143,8 +164,12 @@ pair_at(PairIterator *pi, CharPair *p)
 {
        if (pi->pos >= pi->lim - 1)
                return false;
-       p->pbytes = (uint8_t*)pi->s + pi->pos;
+
+       p->pbytes[0] = (uint8_t)tolower(*(pi->s + pi->pos));
+       p->pbytes[1] = (uint8_t)tolower(*(pi->s + pi->pos + 1));
+
        p->psize = 2;
+       p->idx = pairToIndex(p->pbytes[0], p->pbytes[1]);
        return true;
 }
 
@@ -166,35 +191,31 @@ ignored(CharPair *p, uint8_t elm)
        return isIgnored(p->pbytes[elm]);
 }
 
-#endif // UTF8STRIMPS
+static strimp_masks_t
+STRMPget_mask(Strimps *r, uint64_t idx)
+{
+       return r->masks[idx];
+}
 
-/* Looks up a given pair in the strimp header. Returns the index of the
- * pair, or -1 if it is not found.
- *
- * NOTE: This routine assumes that there are no more than 128 pairs.
+static void
+STRMPset_mask(Strimps *r, uint64_t idx, strimp_masks_t val)
+{
+       r->masks[idx] = val;
+}
+
+/* Looks up a given pair in the strimp header and returns the index as a
+ * 64 bit integer. The return value has a bit on in the position
+ * corresponding to the index of the pair in the strimp header, or is 0
+ * if the pair does not occur.
  */
-static int8_t
+static uint64_t
 STRMPpairLookup(Strimps *s, CharPair *p)
 {
-       size_t idx = 0;
-       size_t npairs = NPAIRS(((uint64_t *)s->strimps.base)[0]);
-       size_t offset = 0;
-       CharPair sp;
-
-       // The return type implies that we have no more than 128 pairs
-       // in the header.
-       assert(npairs <= 128);
+       return STRMPget_mask(s, pairToIndex(p->pbytes[0], p->pbytes[1]));
+}
 
-       for (idx = 0; idx < npairs; idx++) {
-               sp.psize = s->sizes_base[idx];
-               sp.pbytes = s->pairs_base + offset;
-               if (pair_equal(&sp, p))
-                       return (int8_t)idx;
-               offset += sp.psize;
-       }
 
-       return -1;
-}
+#endif // UTF8STRIMPS
 
 
 /* Computes the bitstring of a string s with respect to the strimp r.
@@ -204,7 +225,7 @@ static uint64_t
 STRMPmakebitstring(const char *s, Strimps *r)
 {
        uint64_t ret = 0;
-       int8_t pair_idx = 0;
+       /* int8_t pair_idx = 0; */
        PairIterator pi;
        CharPair cp;
 
@@ -217,9 +238,7 @@ STRMPmakebitstring(const char *s, Strimp
        }
 
        while(pair_at(&pi, &cp)) {
-               pair_idx = STRMPpairLookup(r, &cp);
-               if (pair_idx >= 0)
-                       ret |= ((uint64_t)0x1 << pair_idx);
+               ret |= STRMPpairLookup(r, &cp);
                next_pair(&pi);
        }
 
@@ -233,24 +252,8 @@ STRMPmakebitstring(const char *s, Strimp
                ((TPE *) _a)[_j] = _t;                  \
        } while(0)
 
-/* Finds the indices of the STRIMP_HEADER_SIZE largest counts in a given
- * a histogram. It returns them in the cp pointer.
- *
- * We make one scan of histogram and every time we find a count that is
- * greater than the current minimum of the STRIMP_HEADER_SIZE, we bubble
- * it up in the header until we find a count that is greater. We carry
- * the index in the histogram because this is the information we are
- * actually interested in keeping.
- *
- * At the end of this process we have the indices of STRIMP_HEADER_SIZE
- * largest counts in the histogram. This process is O(n) in time since
- * we are doing constant work (at most STRIMP_HEADER_SIZE-1 comparisons
- * and swaps) for each item in the histogram and as such is
- * (theoretically) more efficient than sorting (O(nlog n))and taking the
- * STRIMP_HEADER_SIZE largest elements. This depends on the size of the
- * histogram n. For some small n sorting might be more efficient, but
- * for such inputs the difference should not be noticeable.
- */
+
+
 static void
 STRMPchoosePairs(PairHistogramElem *hist, size_t hist_size, CharPair *cp)
 {
@@ -275,8 +278,11 @@ STRMPchoosePairs(PairHistogramElem *hist
        }
 
        for(i = 0; i < STRIMP_HEADER_SIZE; i++) {
-               cp[i].pbytes = hist[indices[i]].p->pbytes;
-               cp[i].psize = hist[indices[i]].p->psize;
+               cp[i].pbytes[1] = (uint8_t)(hist[indices[i]].idx & 0xFF);
+               cp[i].pbytes[0] = (uint8_t)((hist[indices[i]].idx >> 8) & 0xFF);
+               cp[i].idx = hist[indices[i]].idx;
+               cp[i].psize = 2;
+               cp[i].mask = ((uint64_t)0x1) << (STRIMP_PAIRS - i - 1);
        }
 
        TRC_DEBUG(ACCELERATOR, LLFMT " usec\n", GDKusec() - t0);
@@ -313,10 +319,16 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai
        }
 
        hlen = STRIMP_HISTSIZE;
-       if ((hist = (PairHistogramElem 
*)GDKzalloc(hlen*sizeof(PairHistogramElem))) == NULL) {
+       if ((hist = (PairHistogramElem 
*)GDKmalloc(hlen*sizeof(PairHistogramElem))) == NULL) {
                return false;
        }
 
+       for(hidx = 0; hidx < hlen; hidx++) {
+               hist[hidx].cnt = 0;
+               hist[hidx].mask = 0;
+               hist[hidx].idx = hidx;
+       }
+
        // Create Histogram
        bi = bat_iterator(b);
        pip = &pi;
@@ -349,7 +361,8 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai
                                        ;
 
                                } else {
-                                       hidx = histogram_index(hist, hlen, cpp);
+                                       /* hidx = histogram_index(hist, hlen, 
cpp); */
+                                       hidx = cpp->idx;
 #ifndef UTF8STRINGS
                                        assert(hidx < hlen);
 #else
@@ -358,20 +371,9 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai
                                                continue;
                                        }
 #endif
-                                       hist[hidx].cnt++;
-                                       if (hist[hidx].p == NULL) {
+                                       if (!hist[hidx].cnt)
                                                values++;
-                                               hist[hidx].p = (CharPair 
*)GDKmalloc(sizeof(CharPair));
-                                               if (!hist[hidx].p) {
-                                                       bat_iterator_end(&bi);
-                                                       for (hidx = 0; hidx < 
hlen; hidx++)
-                                                               
GDKfree(hist[hidx].p);
-                                                       GDKfree(hist);
-                                                       return false;
-                                               }
-                                               hist[hidx].p->psize = 
cpp->psize;
-                                               hist[hidx].p->pbytes = 
cpp->pbytes;
-                                       }
+                                       hist[hidx].cnt++;
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

Reply via email to