Changeset: d0711db453cd for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d0711db453cd
Modified Files:
        gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Add some documentation


diffs (105 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -6,6 +6,40 @@
  * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
  */
 
+
+/* A string imprint is an index that can be used as a prefilter in LIKE
+ * queries. It has 2 components:
+ *
+ * - a header of 64 string element pairs (bytes in the current
+ *   implementation but maybe unicode chars might make more sense).
+ *
+ * - a 64 bit mask for each item in the BAT that encodes the presence or
+ *   absence of each element of the header in the specific item.
+ *
+ * A string imprint is stored in a new Heap in the BAT.
+ *
+ * In the current (byte pair) implementation the first 136 bytes
+ * (i.e. the first 17 64 bit quantities) in the Heap are as follows:
+ *
+ * |                       Version Number                      |   -----
+ * | byte pair 01 | byte pair 02 | byte pair 03 | byte pair 04 |     |
+ * | byte pair 05 | byte pair 06 | byte pair 07 | byte pair 08 |     |  17 64 
bit quantities
+ * [...]                                                             |
+ * | byte pair 61 | byte pair 62 | byte pair 63 | byte pair 64 |   -----
+ *
+ * The bitmasks for each string in the BAT follow after this.
+ *
+ * Strimp creation goes as follows:
+ *
+ * - Construct a histogram of the element (byte or character) pairs for
+ *   all the strings in the BAT.
+ *
+ * - Take the 64 most frequent pairs as the Strimp Header.
+ *
+ * - For each string in the bat construct a 64 bit mask that encodes the
+ *   presence or absence of each member of the header in the string.
+ */
+
 #include "monetdb_config.h"
 #include "gdk.h"
 #include "gdk_private.h"
@@ -13,33 +47,35 @@
 /* This counts how many unicode codepoints the given string
  * contains.
  */
-/* static size_t */
-/* GDKstrimp_strlen(const uint8_t *s) */
-/* { */
-/*     size_t ret = 0; */
-/*     size_t i; */
-/*     int m,n; */
-/*     uint8_t c; */
+#if 0
+static size_t
+GDKstrimp_strlen(const uint8_t *s)
+{
+       size_t ret = 0;
+       size_t i;
+       int m,n;
+       uint8_t c;
 
-/*     i = 0; */
-/*     while((c = *(s + i)) != 0) { */
-/*             if (c < 0x80) */
-/*                     i++; */
-/*             else { */
-/*                     for (n = 0, m=0x40; c & m; n++, m >>= 1) */
-/*                             ; */
-/*                     /\* n is now the number of 10xxxxxx bytes that should */
-/*                        follow. *\/ */
-/*                     if (n == 0 || n >= 4) */
-/*                             /\* TODO: handle invalid utf-8 *\/ */
-/*                             {} */
-/*                     i += n+1; */
-/*             } */
-/*             ret++; */
-/*     } */
+       i = 0;
+       while((c = *(s + i)) != 0) {
+               if (c < 0x80)
+                       i++;
+               else {
+                       for (n = 0, m=0x40; c & m; n++, m >>= 1)
+                               ;
+                       /* n is now the number of 10xxxxxx bytes that should
+                          follow. */
+                       if (n == 0 || n >= 4)
+                               /* TODO: handle invalid utf-8 */
+                               {}
+                       i += n+1;
+               }
+               ret++;
+       }
 
-/*     return ret; */
-/* } */
+       return ret;
+}
+#endif
 
 /* Given a BAT return the number of digrams in it. The observation is
  * that the number of digrams is the number of characters - 1:
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to