MonetDB: string_imprints - Read and write the descriptor correctly

2021-06-03 Thread Panagiotis Koutsourakis
Changeset: a2c6fcd81f79 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/a2c6fcd81f79
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Read and write the descriptor correctly


diffs (130 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -138,7 +138,7 @@ static int8_t
 strimp_lookup(Strimps *s, CharPair *p) {
int8_t ret = -1;
size_t idx = 0;
-   size_t npairs = NPAIRS((uint64_t)s->strimps.base[0]);
+   size_t npairs = NPAIRS(((uint64_t *)s->strimps.base)[0]);
size_t offset = 0;
CharPair sp;
(void)p;
@@ -181,7 +181,7 @@ STRMPmakebitstring(const str s, Strimps 
pi.lim = strlen(s);
 
while(pair_at(, )) {
-   pair_idx = strimp_lookup(r, );
+   pair_idx = STRMPpairLookup(r, );
if (pair_idx > 0)
ret |= 0x1 << pair_idx;
next_pair();
@@ -190,8 +190,6 @@ STRMPmakebitstring(const str s, Strimps 
return ret;
 }
 
-
-
 /* Given a histogram find the indices of the STRIMP_HEADER_SIZE largest
  * counts.
  *
@@ -321,6 +319,7 @@ STRMPbuildHeader(BAT *b, CharPair *hpair
if(hist[hidx].p) {
GDKfree(hist[hidx].p->pbytes);
GDKfree(hist[hidx].p);
+   hist[hidx].p = NULL;
}
}
GDKfree(hist);
@@ -333,7 +332,6 @@ STRMPbuildHeader(BAT *b, CharPair *hpair
 static Strimps *
 STRMPcreateStrimp(BAT *b)
 {
-   uint64_t *d;
uint8_t *h1, *h2;
Strimps *r = NULL;
uint64_t descriptor;
@@ -354,16 +352,15 @@ STRMPcreateStrimp(BAT *b)
if ((r = GDKzalloc(sizeof(Strimps))) == NULL ||
(r->strimps.farmid = BBPselectfarm(b->batRole, b->ttype, 
strimpheap)) < 0 ||
strconcat_len(r->strimps.filename, sizeof(r->strimps.filename),
- nme, ".strimp", NULL) >= sizeof(r->strimps.filename) 
||
+ nme, ".tstrimps", NULL) >= 
sizeof(r->strimps.filename) ||
HEAPalloc(>strimps, BATcount(b)*sizeof(uint64_t) + sz, 
sizeof(uint8_t), 0) != GDK_SUCCEED) {
GDKfree(r);
return NULL;
}
 
-   descriptor =  STRIMP_VERSION | STRIMP_HEADER_SIZE << 8 | ((uint64_t)sz) 
<< 16;
+   descriptor =  STRIMP_VERSION | ((uint64_t)STRIMP_HEADER_SIZE) << 8 | 
((uint64_t)sz) << 16;
 
-   d = (uint64_t *)r->strimps.base;
-   *d = descriptor;
+   ((uint64_t *)r->strimps.base)[0] = descriptor;
r->sizes_base = h1 = (uint8_t *)r->strimps.base + 8;
r->pairs_base = h2 = (uint8_t *)h1 + STRIMP_HEADER_SIZE;
 
@@ -505,7 +502,6 @@ persistStrimp(BAT *b)
if((BBP_status(b->batCacheid) & BBPEXISTING) &&
   b->batInserted == b->batCount)
 }
-#endif
 
 /* Create */
 gdk_return
@@ -521,31 +517,32 @@ STRMPcreate(BAT *b)
assert(b->ttype == TYPE_str);
TRC_DEBUG_IF(ALGO) t0 = GDKusec();
 
-   if (b->tstrimps == NULL) {
-   if ((h = STRMPcreateStrimp(b)) == NULL) {
-   return GDK_FAIL;
-   }
-   dh = (uint64_t *)h->strimps.base + h->strimps.free;
+   if (BATcheckstrimps(b))
+   return GDK_SUCCEED;
+
+   if ((h = STRMPcreateStrimp(b)) == NULL) {
+   return GDK_FAIL;
+   }
+   dh = (uint64_t *)h->strimps.base + h->strimps.free;
 
-   bi = bat_iterator(b);
-   for (i = 0; i < b->batCount; i++) {
-   s = (str)BUNtvar(bi, i);
-   if (!strNil(s))
-   *dh++ = STRMPmakebitstring(s, h);
-   else
-   *dh++ = 0; /* no pairs in nil values */
-   }
-   h->strimps.free += b->batCount*sizeof(uint64_t);
+   bi = bat_iterator(b);
+   for (i = 0; i < b->batCount; i++) {
+   s = (str)BUNtvar(bi, i);
+   if (!strNil(s))
+   *dh++ = STRMPmakebitstring(s, h);
+   else
+   *dh++ = 0; /* no pairs in nil values */
+   }
+   h->strimps.free += b->batCount*sizeof(uint64_t);
 
-   /* After we have computed the strimp, attempt to write it back
-* to the BAT.
-*/
-   MT_lock_set(>batIdxLock);
-   b->tstrimps = h;
-   b->batDirtydesc = true;
-   /* persistStrimp(b) */
-   MT_lock_unset(>batIdxLock);
-   }
+   /* After we have computed the strimp, attempt to write it back
+* to the BAT.
+*/
+   MT_lock_set(>batIdxLock);
+   b->tstrimps = h;
+   b->batDirtydesc = true;
+   persistStrimp(b);
+   MT_lock_unset(>batIdxLock);
 
TRC_DEBUG(ALGO, "strimp creation took " LLFMT " usec\n", GDKusec()-t0);
return GDK_SUCCEED;

MonetDB: Jul2021 - Merge heads

2021-07-07 Thread Panagiotis Koutsourakis
Changeset: 2f44594a914e for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/2f44594a914e
Modified Files:
sql/server/sql_scan.c
Branch: Jul2021
Log Message:

Merge heads


diffs (truncated from 5186 to 300 lines):

diff --git a/clients/Tests/MAL-signatures.stable.out 
b/clients/Tests/MAL-signatures.stable.out
--- a/clients/Tests/MAL-signatures.stable.out
+++ b/clients/Tests/MAL-signatures.stable.out
@@ -9170,6 +9170,7 @@ stdout of test 'MAL-signatures` in direc
 [ "rapi",  "eval_aggr","pattern rapi.eval_aggr(X_0:ptr, X_1:str, 
X_2:any...):any... ", "RAPIevalAggr;",""  ]
 [ "rapi",  "prelude",  "command rapi.prelude():void ", "RAPIprelude;", 
""  ]
 [ "rapi",  "subeval_aggr", "pattern rapi.subeval_aggr(X_0:ptr, X_1:str, 
X_2:any...):any... ",  "RAPIevalAggr;",""  ]
+[ "remote","assert",   "pattern remote.assert(X_0:bit, X_1:str):void 
","RMTassert;",   ""  ]
 [ "remote","batbincopy",   "pattern remote.batbincopy():bat[:any] ",   
"RMTbincopyfrom;",  ""  ]
 [ "remote","batbincopy",   "pattern remote.batbincopy(X_0:bat[:any]):void 
",   "RMTbincopyto;",""  ]
 [ "remote","batload",  "pattern remote.batload(X_0:any_1, 
X_1:int):bat[:any_1] ",  "RMTbatload;",  ""  ]
@@ -9271,6 +9272,7 @@ stdout of test 'MAL-signatures` in direc
 [ "sql",   "deltas",   "pattern sql.deltas(X_0:str, X_1:str) 
(X_2:bat[:int], X_3:bat[:lng], X_4:bat[:lng], X_5:bat[:lng], X_6:bat[:lng], 
X_7:bat[:lng], X_8:bat[:int]) ",  "mvc_delta_values;",""  ]
 [ "sql",   "deltas",   "pattern sql.deltas(X_0:str, X_1:str, X_2:str) 
(X_3:bat[:int], X_4:bat[:lng], X_5:bat[:lng], X_6:bat[:lng], X_7:bat[:lng], 
X_8:bat[:lng], X_9:bat[:int]) ", "mvc_delta_values;",""  ]
 [ "sql",   "dense_rank",   "pattern sql.dense_rank(X_0:any_1, X_1:bit, 
X_2:bit):int ", "SQLdense_rank;",   ""  ]
+[ "sql",   "deregister",   "pattern sql.deregister():int ",
"RAstatementEnd;",  ""  ]
 [ "sql",   "diff", "pattern sql.diff(X_0:any_1):bit ", "SQLdiff;", 
""  ]
 [ "sql",   "diff", "pattern sql.diff(X_0:bit, X_1:any_1):bit ",
"SQLdiff;", ""  ]
 [ "sql",   "drop_hash","unsafe pattern sql.drop_hash(X_0:str, 
X_1:str):void ", "SQLdrop_hash;",""  ]
diff --git a/clients/Tests/MAL-signatures.stable.out.int128 
b/clients/Tests/MAL-signatures.stable.out.int128
--- a/clients/Tests/MAL-signatures.stable.out.int128
+++ b/clients/Tests/MAL-signatures.stable.out.int128
@@ -12471,6 +12471,7 @@ stdout of test 'MAL-signatures` in direc
 [ "rapi",  "eval_aggr","pattern rapi.eval_aggr(X_0:ptr, X_1:str, 
X_2:any...):any... ", "RAPIevalAggr;",""  ]
 [ "rapi",  "prelude",  "command rapi.prelude():void ", "RAPIprelude;", 
""  ]
 [ "rapi",  "subeval_aggr", "pattern rapi.subeval_aggr(X_0:ptr, X_1:str, 
X_2:any...):any... ",  "RAPIevalAggr;",""  ]
+[ "remote","assert",   "pattern remote.assert(X_0:bit, X_1:str):void 
","RMTassert;",   ""  ]
 [ "remote","batbincopy",   "pattern remote.batbincopy():bat[:any] ",   
"RMTbincopyfrom;",  ""  ]
 [ "remote","batbincopy",   "pattern remote.batbincopy(X_0:bat[:any]):void 
",   "RMTbincopyto;",""  ]
 [ "remote","batload",  "pattern remote.batload(X_0:any_1, 
X_1:int):bat[:any_1] ",  "RMTbatload;",  ""  ]
@@ -12577,6 +12578,7 @@ stdout of test 'MAL-signatures` in direc
 [ "sql",   "deltas",   "pattern sql.deltas(X_0:str, X_1:str) 
(X_2:bat[:int], X_3:bat[:lng], X_4:bat[:lng], X_5:bat[:lng], X_6:bat[:lng], 
X_7:bat[:lng], X_8:bat[:int]) ",  "mvc_delta_values;",""  ]
 [ "sql",   "deltas",   "pattern sql.deltas(X_0:str, X_1:str, X_2:str) 
(X_3:bat[:int], X_4:bat[:lng], X_5:bat[:lng], X_6:bat[:lng], X_7:bat[:lng], 
X_8:bat[:lng], X_9:bat[:int]) ", "mvc_delta_values;",""  ]
 [ "sql",   "dense_rank",   "pattern sql.dense_rank(X_0:any_1, X_1:bit, 
X_2:bit):int ", "SQLdense_rank;",   ""  ]
+[ "sql",   "deregister",   "pattern sql.deregister():int ",
"RAstatementEnd;",  ""  ]
 [ "sql",   "diff", "pattern sql.diff(X_0:any_1):bit ", "SQLdiff;", 
""  ]
 [ "sql",   "diff", "pattern sql.diff(X_0:bit, X_1:any_1):bit ",
"SQLdiff;", ""  ]
 [ "sql",   "drop_hash","unsafe pattern sql.drop_hash(X_0:str, 
X_1:str):void ", "SQLdrop_hash;",""  ]
diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -937,6 +937,7 @@ const char *deleteRef;
 void deleteSymbol(Module scope, Symbol prg);
 const char *deltaRef;
 const char *dense_rankRef;
+const char *deregisterRef;
 malType destinationType(MalBlkPtr mb, InstrPtr p);
 const char *diffRef;
 const char *diffcandRef;
diff --git 

MonetDB: Jul2021 - Avoid looking up raw_strings variable on every read

2021-07-07 Thread Panagiotis Koutsourakis
Changeset: 119723724e8a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/119723724e8a
Modified Files:
sql/server/sql_scan.c
sql/server/sql_scan.h
Branch: Jul2021
Log Message:

Avoid looking up raw_strings variable on every read

We keep a bit in the sql scanner that controls whether it reads raw
strings or not. It is initialized using the raw_strings
setting/property.


diffs (149 lines):

diff --git a/sql/server/sql_scan.c b/sql/server/sql_scan.c
--- a/sql/server/sql_scan.c
+++ b/sql/server/sql_scan.c
@@ -528,6 +528,7 @@ scanner_init(struct scanner *s, bstream 
.rs = rs,
.ws = ws,
.mode = LINE_N,
+   .raw_string_mode = GDKgetenv_istrue("raw_strings"),
};
 }
 
@@ -985,11 +986,9 @@ int scanner_symbol(mvc * c, int cur)
return cur;
return tokenize(c, cur);
case '\'':
-#ifdef SQL_STRINGS_USE_ESCAPES
-   if (lc->next_string_is_raw || GDKgetenv_istrue("raw_strings"))
+   if (lc->raw_string_mode || lc->next_string_is_raw)
return scanner_string(c, cur, false);
return scanner_string(c, cur, true);
-#endif
case '"':
return scanner_string(c, cur, false);
case '{':
@@ -1272,9 +1271,7 @@ sql_get_next_token(YYSTYPE *yylval, void
if (token == IDENT || token == COMPARISON ||
token == RANK || token == aTYPE || token == ALIAS) {
yylval->sval = sa_strndup(c->sa, yylval->sval, 
lc->yycur-lc->yysval);
-#ifdef SQL_STRINGS_USE_ESCAPES
lc->next_string_is_raw = false;
-#endif
} else if (token == STRING) {
char quote = *yylval->sval;
char *str = sa_alloc( c->sa, (lc->yycur-lc->yysval-2)*2 + 1 );
@@ -1307,9 +1304,7 @@ sql_get_next_token(YYSTYPE *yylval, void
strcpy(str, yylval->sval + 3);
token = yylval->sval[2] == '\'' ? USTRING : UIDENT;
quote = yylval->sval[2];
-#ifdef SQL_STRINGS_USE_ESCAPES
lc->next_string_is_raw = true;
-#endif
break;
case 'x':
case 'X':
@@ -1321,9 +1316,7 @@ sql_get_next_token(YYSTYPE *yylval, void
*dst = 0;
quote = '\'';
token = XSTRING;
-#ifdef SQL_STRINGS_USE_ESCAPES
lc->next_string_is_raw = true;
-#endif
break;
case 'r':
case 'R':
@@ -1336,9 +1329,7 @@ sql_get_next_token(YYSTYPE *yylval, void
*dst = 0;
break;
default:
-#ifdef SQL_STRINGS_USE_ESCAPES
-   if (GDKgetenv_istrue("raw_strings") ||
-   lc->next_string_is_raw) {
+   if (lc->raw_string_mode || lc->next_string_is_raw) {
dst = str;
for (char *src = yylval->sval + 1; *src; dst++)
if ((*dst = *src++) == '\'' && *src == 
'\'')
@@ -1349,23 +1340,14 @@ sql_get_next_token(YYSTYPE *yylval, void
  (unsigned char *)yylval->sval + 1,
  lc->yycur - lc->yysval - 1);
}
-#else
-   dst = str;
-   for (char *src = yylval->sval + 1; *src; dst++)
-   if ((*dst = *src++) == '\'' && *src == '\'')
-   src++;
-   *dst = 0;
-#endif
break;
}
yylval->sval = str;
 
/* reset original */
lc->rs->buf[lc->rs->pos+lc->yycur- 1] = quote;
-#ifdef SQL_STRINGS_USE_ESCAPES
} else {
lc->next_string_is_raw = false;
-#endif
}
 
return(token);
diff --git a/sql/server/sql_scan.h b/sql/server/sql_scan.h
--- a/sql/server/sql_scan.h
+++ b/sql/server/sql_scan.h
@@ -15,11 +15,6 @@
 
 typedef enum { LINE_1, LINE_N } prot;
 
-/* Currently, MonetDB interprets \ specially in strings.  This is
- * contrary to the SQL standard.  Remove this define to revert to the
- * standard interpretation. */
-#define SQL_STRINGS_USE_ESCAPES 1
-
 struct scanner {
bstream *rs;
stream *ws;
@@ -36,15 +31,31 @@ struct scanner {
prot mode;  /* which mode (line (1,N), blocked) */
char *schema;   /* Keep schema name of create statement, needed 
AUTO_INCREMENT, SERIAL */
char *errstr;   /* error message from the bowels of the scanner */
-#ifdef SQL_STRINGS_USE_ESCAPES
-   /* because we interpret \ in strings, we need state in the
-* scanner so that we Do The Right Thing (TM) when we get a
-* unicode string split up in multiple parts (i.e. 

MonetDB: string_imprints - Merge with default

2021-07-01 Thread Panagiotis Koutsourakis
Changeset: 80f037721006 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/80f037721006
Modified Files:
gdk/gdk_private.h
Branch: string_imprints
Log Message:

Merge with default


diffs (truncated from 5721 to 300 lines):

diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -541,8 +541,8 @@ gdk_return log_bat_transient(logger *lg,
 gdk_return log_constant(logger *lg, int type, ptr val, log_id id, lng offset, 
lng cnt);
 gdk_return log_delta(logger *lg, BAT *uid, BAT *uval, log_id id);
 gdk_return log_sequence(logger *lg, int seq, lng id);
-gdk_return log_tend(logger *lg);
-gdk_return log_tstart(logger *lg, ulng commit_ts, bool flush);
+gdk_return log_tend(logger *lg, ulng commit_ts);
+gdk_return log_tstart(logger *lg, bool flush);
 gdk_return logger_activate(logger *lg);
 lng logger_changes(logger *lg);
 logger *logger_create(int debug, const char *fn, const char *logdir, int 
version, preversionfix_fptr prefuncp, postversionfix_fptr postfuncp, void 
*funcdata);
@@ -761,6 +761,7 @@ void MCcloseClient(Client c);
 Client MCforkClient(Client father);
 Client MCgetClient(int id);
 Client MCinitClient(oid user, bstream *fin, stream *fout);
+size_t MCmemoryClaim(void);
 int MCpushClientInput(Client c, bstream *new_input, int listing, char *prompt);
 void MCstopClients(Client c);
 str MCsuspendClient(int id);
diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c
--- a/gdk/gdk_bat.c
+++ b/gdk/gdk_bat.c
@@ -809,12 +809,12 @@ COLcopy(BAT *b, int tt, bool writable, r
 
/* first try case (1); create a view, possibly with different
 * atom-types */
-   if (role == b->batRole &&
+   if (!writable &&
+   role == b->batRole &&
b->batRestricted == BAT_READ &&
ATOMstorage(b->ttype) != TYPE_msk && /* no view on TYPE_msk */
(!VIEWtparent(b) ||
-BBP_cache(VIEWtparent(b))->batRestricted == BAT_READ) &&
-   !writable) {
+BBP_cache(VIEWtparent(b))->batRestricted == BAT_READ)) {
bn = VIEWcreate(b->hseqbase, b);
if (bn == NULL)
return NULL;
@@ -875,8 +875,8 @@ COLcopy(BAT *b, int tt, bool writable, r
strconcat_len(thp.filename, sizeof(thp.filename),
  BBP_physical(bn->batCacheid),
  ".theap", NULL);
-   if ((b->ttype && HEAPcopy(, b->theap) != 
GDK_SUCCEED) ||
-   (bn->tvheap && HEAPcopy(, b->tvheap) != 
GDK_SUCCEED)) {
+   if ((b->ttype && HEAPcopy(, b->theap, b->tbaseoff 
<< b->tshift) != GDK_SUCCEED) ||
+   (bn->tvheap && HEAPcopy(, b->tvheap, 0) != 
GDK_SUCCEED)) {
HEAPfree(, true);
HEAPfree(, true);
BBPreclaim(bn);
diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c
--- a/gdk/gdk_batop.c
+++ b/gdk/gdk_batop.c
@@ -32,7 +32,7 @@ unshare_varsized_heap(BAT *b)
h->farmid = BBPselectfarm(b->batRole, TYPE_str, varheap);
strconcat_len(h->filename, sizeof(h->filename),
  BBP_physical(b->batCacheid), ".theap", NULL);
-   if (HEAPcopy(h, b->tvheap) != GDK_SUCCEED) {
+   if (HEAPcopy(h, b->tvheap, 0) != GDK_SUCCEED) {
HEAPfree(h, true);
GDKfree(h);
return GDK_FAIL;
@@ -496,7 +496,7 @@ append_varsized_bat(BAT *b, BAT *n, stru
h->farmid = BBPselectfarm(b->batRole, b->ttype, varheap);
strconcat_len(h->filename, sizeof(h->filename),
  BBP_physical(b->batCacheid), ".theap", NULL);
-   if (HEAPcopy(h, b->tvheap) != GDK_SUCCEED) {
+   if (HEAPcopy(h, b->tvheap, 0) != GDK_SUCCEED) {
HEAPfree(h, true);
GDKfree(h);
return GDK_FAIL;
diff --git a/gdk/gdk_group.c b/gdk/gdk_group.c
--- a/gdk/gdk_group.c
+++ b/gdk/gdk_group.c
@@ -453,41 +453,43 @@ rev(oid x)
return x;
 }
 
-/* population count: count number of 1 bits in a value */
-static inline int
-pop(oid x)
+/* count trailing zeros, also see candmask_lobit in gdk_cand.h */
+static inline int __attribute__((__const__))
+ctz(oid x)
 {
-#ifdef __GNUC__
+#if defined(__GNUC__)
 #if SIZEOF_OID == SIZEOF_INT
-   return __builtin_popcount(x);
+   return __builtin_ctz(x);
 #else
-   return __builtin_popcountl(x);
+   return __builtin_ctzl(x);
 #endif
-#else
-#ifdef _MSC_VER
+#elif defined(_MSC_VER)
 #if SIZEOF_OID == SIZEOF_INT
-   return (int) __popcnt((unsigned int) (x));
-#else
-   return (int) __popcnt64((unsigned __int64) (x));
-#endif
+   unsigned long idx;
+   if (_BitScanForward(, (unsigned long) x))
+   

MonetDB: string_imprints - Fix strimp generation bugs

2021-07-01 Thread Panagiotis Koutsourakis
Changeset: a7567eea4081 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/a7567eea4081
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Fix strimp generation bugs


diffs (55 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -133,12 +133,10 @@ next_pair(PairIterator *pi) {
 
 static int8_t
 STRMPpairLookup(Strimps *s, CharPair *p) {
-   int8_t ret = -1;
size_t idx = 0;
size_t npairs = NPAIRS(((uint64_t *)s->strimps.base)[0]);
size_t offset = 0;
CharPair sp;
-   (void)p;
 
for (idx = 0; idx < npairs; idx++) {
sp.psize = s->sizes_base[idx];
@@ -148,7 +146,7 @@ STRMPpairLookup(Strimps *s, CharPair *p)
offset += sp.psize;
}
 
-   return ret;
+   return -1;
 }
 
 static bool
@@ -160,8 +158,8 @@ ignored(CharPair *p, uint8_t elm) {
 #define MAX_PAIR_SIZE 8
 
 /* Given a strimp header and a string compute the bitstring of which
- * digrams(byte pairs) are present in the string. The strimp header is a
- * map from digram(byte pair) to index in the strimp.
+ * digrams are present in the string. The strimp header is a map from
+ * digram to index in the strimp.
  *
  * This should probably be inlined.
  */
@@ -179,8 +177,8 @@ STRMPmakebitstring(const str s, Strimps 
 
while(pair_at(, )) {
pair_idx = STRMPpairLookup(r, );
-   if (pair_idx > 0)
-   ret |= 0x1 << pair_idx;
+   if (pair_idx >= 0)
+   ret |= ((uint64_t)0x1 << pair_idx);
next_pair();
}
 
@@ -617,7 +615,6 @@ STRMPcreate(BAT *b)
 }
 
 /* Left over code */
-
 #if 0
 /* This counts how many unicode codepoints the given string
  * contains.
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Merge with default

2021-07-01 Thread Panagiotis Koutsourakis
Changeset: cffe5ff7bdad for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/cffe5ff7bdad
Modified Files:
gdk/gdk.h
gdk/gdk_private.h
Branch: string_imprints
Log Message:

Merge with default


diffs (truncated from 125754 to 300 lines):

diff --git a/clients/Tests/MAL-signatures.stable.out 
b/clients/Tests/MAL-signatures.stable.out
--- a/clients/Tests/MAL-signatures.stable.out
+++ b/clients/Tests/MAL-signatures.stable.out
@@ -5752,7 +5752,7 @@ stdout of test 'MAL-signatures` in direc
 [ "batcalc",   "timestamp","pattern batcalc.timestamp(X_0:bat[:timestamp], 
X_1:bat[:oid], X_2:int):bat[:timestamp] ",  "timestamp_2time_timestamp;",   
""  ]
 [ "batcalc",   "uuid", "command batcalc.uuid(X_0:bat[:str], 
X_1:bat[:oid]):bat[:uuid] ",   "UUIDstr2uuid_bulk;",   ""  ]
 [ "batcalc",   "uuid", "command batcalc.uuid(X_0:bat[:uuid], 
X_1:bat[:oid]):bat[:uuid] ",  "UUIDuuid2uuid_bulk;",  ""  ]
-[ "batcalc",   "wkb",  "command batcalc.wkb(X_0:bat[:wkb], X_1:int, 
X_2:int):bat[:wkb] ",  "geom_2_geom_bat;", ""  ]
+[ "batcalc",   "wkb",  "command batcalc.wkb(X_0:bat[:wkb], X_1:bat[:oid], 
X_2:int, X_3:int):bat[:wkb] ",   "geom_2_geom_bat;", ""  ]
 [ "batcalc",   "xml",  "command batcalc.xml(X_0:bat[:str]):bat[:xml] ",
"BATXMLstr2xml;",   ""  ]
 [ "batcalc",   "xor",  "pattern batcalc.xor(X_0:bat[:bit], 
X_1:bat[:bit]):bat[:bit] ", "CMDbatXOR;",   ""  ]
 [ "batcalc",   "xor",  "pattern batcalc.xor(X_0:bat[:bit], X_1:bat[:bit], 
X_2:bat[:oid], X_3:bat[:oid]):bat[:bit] ",   "CMDbatXOR;",   ""  ]
@@ -9208,7 +9208,7 @@ stdout of test 'MAL-signatures` in direc
 [ "sql",   "analyze",  "unsafe pattern sql.analyze(X_0:int, X_1:lng, 
X_2:str, X_3:str):void ", "sql_analyze;", ""  ]
 [ "sql",   "analyze",  "unsafe pattern sql.analyze(X_0:int, X_1:lng, 
X_2:str, X_3:str, X_4:str):void ","sql_analyze;", ""  ]
 [ "sql",   "any",  "pattern sql.any(X_0:bit, X_1:bit, X_2:bit):bit ",  
"SQLany_cmp;",  ""  ]
-[ "sql",   "append",   "pattern sql.append(X_0:int, X_1:str, X_2:str, 
X_3:str, X_4:lng, X_5:any):int ","mvc_append_wrap;", ""  ]
+[ "sql",   "append",   "pattern sql.append(X_0:int, X_1:str, X_2:str, 
X_3:str, X_4:bat[:oid], X_5:any):int ",  "mvc_append_wrap;", ""  ]
 [ "sql",   "argRecord","pattern sql.argRecord():str ", 
"SQLargRecord;",""  ]
 [ "sql",   "argRecord","pattern sql.argRecord(X_0:any...):str ",   
"SQLargRecord;",""  ]
 [ "sql",   "assert",   "pattern sql.assert(X_0:bit, X_1:str):void ",   
"SQLassert;",   ""  ]
@@ -9232,7 +9232,7 @@ stdout of test 'MAL-signatures` in direc
 [ "sql",   "bind_idxbat",  "pattern sql.bind_idxbat(X_0:int, X_1:str, 
X_2:str, X_3:str, X_4:int):bat[:any_1] ","mvc_bind_idxbat_wrap;","" 
 ]
 [ "sql",   "bind_idxbat",  "pattern sql.bind_idxbat(X_0:int, X_1:str, 
X_2:str, X_3:str, X_4:int, X_5:int, X_6:int) (X_7:bat[:oid], X_8:bat[:any_1]) 
", "mvc_bind_idxbat_wrap;",""  ]
 [ "sql",   "bind_idxbat",  "pattern sql.bind_idxbat(X_0:int, X_1:str, 
X_2:str, X_3:str, X_4:int, X_5:int, X_6:int):bat[:any_1] ",  
"mvc_bind_idxbat_wrap;",""  ]
-[ "sql",   "claim","unsafe pattern sql.claim(X_0:int, X_1:str, 
X_2:str, X_3:lng):lng ","mvc_claim_wrap;",  ""  ]
+[ "sql",   "claim","unsafe pattern sql.claim(X_0:int, X_1:str, 
X_2:str, X_3:lng):bat[:oid] ",  "mvc_claim_wrap;",  ""  ]
 [ "sql",   "clear_table",  "unsafe pattern sql.clear_table(X_0:str, 
X_1:str):lng ","mvc_clear_table_wrap;",""  ]
 [ "sql",   "commit",   "unsafe pattern sql.commit():void ",
"SQLcommit;",   ""  ]
 [ "sql",   "copy_from","unsafe pattern sql.copy_from(X_0:ptr, X_1:str, 
X_2:str, X_3:str, X_4:str, X_5:str, X_6:lng, X_7:lng, X_8:int, X_9:str, 
X_10:int, X_11:int):bat[:any]... ", "mvc_import_table_wrap;",   ""  
]
@@ -9244,6 +9244,7 @@ stdout of test 'MAL-signatures` in direc
 [ "sql",   "corr", "pattern sql.corr(X_0:int, X_1:int, X_2:bit, X_3:bit, 
X_4:int, X_5:oid, X_6:oid):dbl ", "SQLcorr;", ""  ]
 [ "sql",   "corr", "pattern sql.corr(X_0:lng, X_1:lng, X_2:bit, X_3:bit, 
X_4:int, X_5:oid, X_6:oid):dbl ", "SQLcorr;", ""  ]
 [ "sql",   "corr", "pattern sql.corr(X_0:sht, X_1:sht, X_2:bit, X_3:bit, 
X_4:int, X_5:oid, X_6:oid):dbl ", "SQLcorr;", ""  ]
+[ "sql",   "count","pattern sql.count(X_0:str, X_1:str):lng ", 
"SQLbasecount;",""  ]
 [ "sql",   "count","pattern sql.count(X_0:any_1, X_1:bit, X_2:bit, 
X_3:bit, X_4:int, X_5:oid, X_6:oid):lng ",  "SQLcount;",""  ]
 [ "sql",   "covariance",   "pattern sql.covariance(X_0:bte, X_1:bte, 
X_2:bit, X_3:bit, X_4:int, X_5:oid, X_6:oid):dbl ",   "SQLcovar_samp;",   
""  ]
 [ "sql",   "covariance",   

MonetDB: string_imprints - Initial implementation of the strimp filter

2021-07-01 Thread Panagiotis Koutsourakis
Changeset: 4ad4318de13e for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/4ad4318de13e
Added Files:
sql/scripts/90_strimps.sql
Modified Files:
gdk/gdk_strimps.c
monetdb5/modules/mal/batExtensions.c
sql/backends/monet5/CMakeLists.txt
sql/scripts/CMakeLists.txt
Branch: string_imprints
Log Message:

Initial implementation of the strimp filter


diffs (145 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -455,6 +455,7 @@ STRMPfilter(BAT *b, char *q)
BUN i;
uint64_t qbmask;
uint64_t *ptr;
+   int zz = 0;
 
 
if (b->tstrimps == NULL)
@@ -471,15 +472,19 @@ STRMPfilter(BAT *b, char *q)
qbmask = STRMPmakebitstring(q, b->tstrimps);
ptr = (uint64_t *)b->tstrimps->strimps_base;
 
-
for (i = 0; i < b->batCount; i++) {
-   if ((*ptr & qbmask) == qbmask) {
+   if ((*(ptr + i) & qbmask) == qbmask) {
oid pos = i;
if (BUNappend(r, , false) != GDK_SUCCEED)
goto sfilter_fail;
}
+   else {
+   zz++;
+   }
}
+   printf("filtered out: %d entries\n", zz);
 
+   r->tkey = true;
return virtualize(r);
 
 
diff --git a/monetdb5/modules/mal/batExtensions.c 
b/monetdb5/modules/mal/batExtensions.c
--- a/monetdb5/modules/mal/batExtensions.c
+++ b/monetdb5/modules/mal/batExtensions.c
@@ -340,7 +340,7 @@ PATstrimp_makehist(Client cntxt, MalBlkP
 }
 #endif
 static str
-PATstrimp(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+PATstrimpCreate(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
 {
bat bid;
BAT *b;
@@ -358,6 +358,50 @@ PATstrimp(Client cntxt, MalBlkPtr mb, Ma
return MAL_SUCCEED;
 }
 
+static str
+PATstrimpFilter(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) {
+   (void)cntxt;
+   (void)mb;
+   (void)stk;
+   (void)pci;
+   throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) "UNIMPLEMENTED");
+}
+
+static str
+PATstrimpFilterSelect(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
+   bat bid, sid;
+   BAT *b, *s, *ob;
+   str pat;
+
+   (void)cntxt;
+   (void)mb;
+
+   bid = *getArgReference_bat(stk, pci, 1);
+   if ((b = BATdescriptor(bid)) == NULL)
+   throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
+
+   sid = *getArgReference_bat(stk, pci, 2);
+   if ((s = BATdescriptor(sid)) == NULL)
+   throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
+
+   assert(s->ttype == TYPE_void);
+
+   if (!STRMPcreate(b)) {
+   throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) 
OPERATION_FAILED);
+   }
+
+   pat = *getArgReference_str(stk, pci, 3);
+   if ((ob = STRMPfilter(b, pat)) == NULL) {
+   BBPunfix(b->batCacheid);
+   throw(MAL, "bat.strimpfilter", SQLSTATE(HY002));
+   }
+
+   *getArgReference_bat(stk, pci, 0) = ob->batCacheid;
+   BBPkeepref(ob->batCacheid);
+
+   return MAL_SUCCEED;
+}
 
 #include "mel.h"
 mel_func batExtensions_init_funcs[] = {
@@ -392,7 +436,10 @@ mel_func batExtensions_init_funcs[] = {
  /* String imprints */
  // pattern("bat", "strimpNDigrams", PATstrimp_ndigrams, false, "count digrams 
in a string bat", args(1,2,arg("",lng),batarg("b",str))),
  // pattern("bat", "strimpHistogram", PATstrimp_makehist, false, "make a 
histogram of all the byte pairs in a BAT", args(2,3,arg("",lng), 
batarg("",lng),batarg("b",str))),
- pattern("bat", "strimp", PATstrimp, false, "construct the strimp a BAT", 
args(1,2,arg("",void),batarg("b",str))),
+ pattern("bat", "mkstrimp", PATstrimpCreate, false, "construct the strimp a 
BAT", args(1,2,arg("",void),batarg("b",str))),
+ pattern("bat", "strimpfilter", PATstrimpFilter, false, "", 
args(1,3,arg("",bit),arg("b",str),arg("q",str))),
+ pattern("bat", "strimpfilterselect", PATstrimpFilterSelect, false, "", 
args(1,5,batarg("",oid),batarg("b",str),batarg("s",oid),arg("q",str),arg("a",bit))),
+ pattern("bat", "strimpfilterjoin", PATstrimpFilter, false, "", 
args(2,8,batarg("",oid),batarg("b",str),arg("q",str))),
  { .imp=NULL }
 };
 #include "mal_import.h"
diff --git a/sql/backends/monet5/CMakeLists.txt 
b/sql/backends/monet5/CMakeLists.txt
--- a/sql/backends/monet5/CMakeLists.txt
+++ b/sql/backends/monet5/CMakeLists.txt
@@ -40,7 +40,8 @@ set(include_sql_files
   75_storagemodel
   76_dump
   80_statistics
-  81_tracer)
+  81_tracer
+  90_strimps)
 
 if(HAVE_HGE)
   list(APPEND include_sql_files
diff --git a/sql/scripts/90_strimps.sql b/sql/scripts/90_strimps.sql
new file mode 100644
--- /dev/null
+++ b/sql/scripts/90_strimps.sql
@@ -0,0 +1,8 @@
+create schema strimps;
+
+-- create procedure strimps.strmpcreate(b string)
+-- external name bat.strimpCreate;
+-- grant execute on procedure 

MonetDB: string_imprints - Initial sql interface

2021-07-05 Thread Panagiotis Koutsourakis
Changeset: 0713d2b9a640 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/0713d2b9a640
Added Files:
sql/backends/monet5/sql_strimps.c
sql/backends/monet5/sql_strimps.h
Modified Files:
sql/backends/monet5/CMakeLists.txt
sql/backends/monet5/sql.c
sql/scripts/90_strimps.sql
Branch: string_imprints
Log Message:

Initial sql interface

To use, first construct the strimp by specifying the full column name:

sql>call createstrimps('schema', 'table', 'column');

Once the strimp has been constructed, you can filter the column using the filter syntax:

sql>select column from table where [column] strimps.filter ['query string'];


diffs (147 lines):

diff --git a/sql/backends/monet5/CMakeLists.txt 
b/sql/backends/monet5/CMakeLists.txt
--- a/sql/backends/monet5/CMakeLists.txt
+++ b/sql/backends/monet5/CMakeLists.txt
@@ -78,6 +78,7 @@ target_sources(sql
   sql_round_impl.h
   sql_fround.c sql_fround_impl.h
   sql_orderidx.c sql_orderidx.h
+  sql_strimps.c sql_strimps.h
   sql_time.c
   sql_bincopyfrom.c
   wlr.c wlr.h
diff --git a/sql/backends/monet5/sql.c b/sql/backends/monet5/sql.c
--- a/sql/backends/monet5/sql.c
+++ b/sql/backends/monet5/sql.c
@@ -4929,6 +4929,7 @@ finalize:
 #include "sql_assert.h"
 #include "sql_execute.h"
 #include "sql_orderidx.h"
+#include "sql_strimps.h"
 #include "sql_subquery.h"
 #include "sql_statistics.h"
 #include "sql_transaction.h"
@@ -5048,6 +5049,7 @@ static mel_func sql_init_funcs[] = {
  pattern("sql", "storage", sql_storage, false, "return a table with storage 
information for a particular column", args(17,20, 
batarg("schema",str),batarg("table",str),batarg("column",str),batarg("type",str),batarg("mode",str),batarg("location",str),batarg("count",lng),batarg("atomwidth",int),batarg("columnsize",lng),batarg("heap",lng),batarg("hashes",lng),batarg("phash",bit),batarg("imprints",lng),batarg("sorted",bit),batarg("revsorted",bit),batarg("key",bit),batarg("orderidx",lng),arg("sname",str),arg("tname",str),arg("cname",str))),
  pattern("sql", "createorderindex", sql_createorderindex, true, "Instantiate 
the order index on a column", args(0,3, 
arg("sch",str),arg("tbl",str),arg("col",str))),
  pattern("sql", "droporderindex", sql_droporderindex, true, "Drop the order 
index on a column", args(0,3, arg("sch",str),arg("tbl",str),arg("col",str))),
+ pattern("sql", "createstrimps", sql_createstrimps, true, "Instantiate the 
strimps index on a column", args(0,3, 
arg("sch",str),arg("tbl",str),arg("col",str))),
  command("calc", "identity", SQLidentity, false, "Returns a unique row 
identitfier.", args(1,2, arg("",oid),argany("",0))),
  command("batcalc", "identity", BATSQLidentity, false, "Returns the unique row 
identitfiers.", args(1,2, batarg("",oid),batargany("b",0))),
  pattern("batcalc", "identity", PBATSQLidentity, false, "Returns the unique 
row identitfiers.", args(2,4, 
batarg("resb",oid),arg("ns",oid),batargany("b",0),arg("s",oid))),
diff --git a/sql/backends/monet5/sql_strimps.c 
b/sql/backends/monet5/sql_strimps.c
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/sql_strimps.c
@@ -0,0 +1,73 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0.  If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
+ */
+
+#include "monetdb_config.h"
+#include "mal_backend.h"
+#include "sql_strimps.h"
+
+static str
+sql_load_bat(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci, BAT **b)
+{
+   mvc *m = NULL;
+   str msg = getSQLContext(cntxt, mb, , NULL);
+   str sch,tbl,col;
+   sql_schema *s;
+   sql_table *t;
+   sql_column *c;
+
+   if (msg != MAL_SUCCEED || (msg = checkSQLContext(cntxt)) != NULL)
+   return msg;
+
+   sch = *getArgReference_str(stk, pci, 1);
+   tbl = *getArgReference_str(stk, pci, 2);
+   col = *getArgReference_str(stk, pci, 3);
+
+   if (!(s = mvc_bind_schema(m, sch)))
+   throw(SQL, "sql.createstrimps", SQLSTATE(3FOOO) "Unknown schema 
%s", sch);
+
+   if (!mvc_schema_privs(m, s))
+   throw(SQL, "sql.createstrimps", SQLSTATE(42000) "Access denied 
for %s to schema '%s'",
+ get_string_global_var(m, "current_user"), 
s->base.name);
+   if (!(t = mvc_bind_table(m, s, tbl)) || !isTable(t))
+   throw(SQL, "sql.createstrimps", SQLSTATE(42S02) "Unknown table 
%s.%s", sch, tbl);
+   if (!(c = mvc_bind_column(m, t, col)))
+   throw(SQL, "sql.createstrimps", SQLSTATE(38000) "Unknown column 
%s.%s.%s", sch, tbl, col);
+
+   sqlstore *store = m->session->tr->store;
+   *b = store->storage_api.bind_col(m->session->tr, c, 0);
+   if (*b == 0)
+   throw(SQL, "sql.createstrimps", SQLSTATE(HY005) "Cannot access 
column %s", col);
+
+   return msg;
+
+}
+
+str
+sql_createstrimps(Client cntxt, MalBlkPtr mb, MalStkPtr stk, 

MonetDB: string_imprints - Extract a separate MAL module for str...

2021-07-05 Thread Panagiotis Koutsourakis
Changeset: 1bb96864a107 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/1bb96864a107
Added Files:
monetdb5/modules/mal/strimps.c
Modified Files:
monetdb5/modules/mal/CMakeLists.txt
monetdb5/modules/mal/batExtensions.c
Branch: string_imprints
Log Message:

Extract a separate MAL module for strimps


diffs (261 lines):

diff --git a/monetdb5/modules/mal/CMakeLists.txt 
b/monetdb5/modules/mal/CMakeLists.txt
--- a/monetdb5/modules/mal/CMakeLists.txt
+++ b/monetdb5/modules/mal/CMakeLists.txt
@@ -43,6 +43,7 @@ target_sources(malmodules
   projectionpath.c
   tablet.c tablet.h
   batcalc.c calc.c
+  strimps.c
   PUBLIC
   ${CMAKE_CURRENT_SOURCE_DIR}/wlc.h)
 
diff --git a/monetdb5/modules/mal/batExtensions.c 
b/monetdb5/modules/mal/batExtensions.c
--- a/monetdb5/modules/mal/batExtensions.c
+++ b/monetdb5/modules/mal/batExtensions.c
@@ -339,69 +339,6 @@ PATstrimp_makehist(Client cntxt, MalBlkP
return MAL_SUCCEED;
 }
 #endif
-static str
-PATstrimpCreate(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
-{
-   bat bid;
-   BAT *b;
-   (void)cntxt;
-   (void)mb;
-
-   bid = *getArgReference_bat(stk, pci, 1);
-   if ((b = BATdescriptor(bid)) == NULL)
-   throw(MAL, "bat.strimpHeader", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
-
-   if(STRMPcreate(b) != GDK_SUCCEED)
-   throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) 
OPERATION_FAILED);
-
-   // *getArgReference_lng(stk, pci, 0) = 0;
-   return MAL_SUCCEED;
-}
-
-static str
-PATstrimpFilter(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) {
-   (void)cntxt;
-   (void)mb;
-   (void)stk;
-   (void)pci;
-   throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) "UNIMPLEMENTED");
-}
-
-static str
-PATstrimpFilterSelect(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
-{
-   bat bid, sid;
-   BAT *b, *s, *ob;
-   str pat;
-
-   (void)cntxt;
-   (void)mb;
-
-   bid = *getArgReference_bat(stk, pci, 1);
-   if ((b = BATdescriptor(bid)) == NULL)
-   throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
-
-   sid = *getArgReference_bat(stk, pci, 2);
-   if ((s = BATdescriptor(sid)) == NULL)
-   throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
-
-   assert(s->ttype == TYPE_void);
-
-   if (!STRMPcreate(b)) {
-   throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) 
OPERATION_FAILED);
-   }
-
-   pat = *getArgReference_str(stk, pci, 3);
-   if ((ob = STRMPfilter(b, pat)) == NULL) {
-   BBPunfix(b->batCacheid);
-   throw(MAL, "bat.strimpfilter", SQLSTATE(HY002));
-   }
-
-   *getArgReference_bat(stk, pci, 0) = ob->batCacheid;
-   BBPkeepref(ob->batCacheid);
-
-   return MAL_SUCCEED;
-}
 
 #include "mel.h"
 mel_func batExtensions_init_funcs[] = {
@@ -432,14 +369,6 @@ mel_func batExtensions_init_funcs[] = {
 #endif
  pattern("bat", "appendBulk", CMDBATappend_bulk, false, "append the arguments 
ins to i", args(1,4, batargany("",1), 
batargany("i",1),arg("force",bit),varargany("ins",1))),
  pattern("bat", "appendBulk", CMDBATappend_bulk, false, "append the arguments 
ins to i", args(1,4, batargany("",1), 
batargany("i",1),arg("force",bit),batvarargany("ins",1))),
-
- /* String imprints */
- // pattern("bat", "strimpNDigrams", PATstrimp_ndigrams, false, "count digrams 
in a string bat", args(1,2,arg("",lng),batarg("b",str))),
- // pattern("bat", "strimpHistogram", PATstrimp_makehist, false, "make a 
histogram of all the byte pairs in a BAT", args(2,3,arg("",lng), 
batarg("",lng),batarg("b",str))),
- pattern("bat", "mkstrimp", PATstrimpCreate, false, "construct the strimp a 
BAT", args(1,2,arg("",void),batarg("b",str))),
- pattern("bat", "strimpfilter", PATstrimpFilter, false, "", 
args(1,3,arg("",bit),arg("b",str),arg("q",str))),
- pattern("bat", "strimpfilterselect", PATstrimpFilterSelect, false, "", 
args(1,5,batarg("",oid),batarg("b",str),batarg("s",oid),arg("q",str),arg("a",bit))),
- pattern("bat", "strimpfilterjoin", PATstrimpFilter, false, "", 
args(2,8,batarg("",oid),batarg("b",str),arg("q",str))),
  { .imp=NULL }
 };
 #include "mal_import.h"
diff --git a/monetdb5/modules/mal/strimps.c b/monetdb5/modules/mal/strimps.c
new file mode 100644
--- /dev/null
+++ b/monetdb5/modules/mal/strimps.c
@@ -0,0 +1,157 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0.  If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
+ */
+#include "monetdb_config.h"
+#include "mal_client.h"
+#include "mal_interpreter.h"
+#include "mal_exception.h"
+
+#if 0
+/*
+ * String imprints.
+ */
+static str
+PATstrimp_ndigrams(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
+   bat bid;
+   BAT *b;
+   size_t n;
+
+ 

MonetDB: string_imprints - Attempt to make filtering mitosis-aware

2021-07-05 Thread Panagiotis Koutsourakis
Changeset: 4b4623152417 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/4b4623152417
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Attempt to make filtering mitosis-aware

This still needs more work.


diffs (59 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -455,18 +455,17 @@ STRMPfilter(BAT *b, char *q)
BUN i;
uint64_t qbmask;
uint64_t *ptr;
-   int zz = 0;
-
 
if (b->tstrimps == NULL)
goto sfilter_fail;
 
-   r = COLnew(0, TYPE_oid, b->batCount, TRANSIENT);
+   r = COLnew(b->hseqbase, TYPE_oid, b->batCount, TRANSIENT);
if (r == NULL) {
goto sfilter_fail;
}
 
if (!BATcheckstrimps(b)) {
+   BBPunfix(r->batCacheid);
goto sfilter_fail;
}
qbmask = STRMPmakebitstring(q, b->tstrimps);
@@ -478,18 +477,13 @@ STRMPfilter(BAT *b, char *q)
if (BUNappend(r, , false) != GDK_SUCCEED)
goto sfilter_fail;
}
-   else {
-   zz++;
-   }
}
-   printf("filtered out: %d entries\n", zz);
 
r->tkey = true;
return virtualize(r);
 
 
  sfilter_fail:
-   BBPunfix(r->batCacheid);
return NULL;
 }
 
@@ -579,9 +573,15 @@ STRMPcreate(BAT *b)
assert(b->ttype == TYPE_str);
TRC_DEBUG_IF(ALGO) t0 = GDKusec();
 
+
if (BATcheckstrimps(b))
return GDK_SUCCEED;
 
+   if (VIEWtparent(b)) {
+   assert(b->tstrimps == NULL);
+   b = BBPdescriptor(VIEWtparent(b));
+   }
+
if ((h = STRMPcreateStrimpHeap(b)) == NULL) {
return GDK_FAIL;
}
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Merge with default

2021-04-22 Thread Panagiotis Koutsourakis
Changeset: 0f94f85f07bf for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/0f94f85f07bf
Branch: string_imprints
Log Message:

Merge with default


diffs (truncated from 889 to 300 lines):

diff --git a/documentation/index.rst b/documentation/index.rst
--- a/documentation/index.rst
+++ b/documentation/index.rst
@@ -16,6 +16,9 @@ Welcome to MonetDB's documentation!
monetdbe/examples
monetdbe/installation
monetdbe/monetdbe_api
+   monetdbe/manual_pages/monetdbe_open
+   monetdbe/manual_pages/monetdbe_options
+   monetdbe/manual_pages/monetdbe_remote
source/intro
source/build
source/build-fedora
diff --git a/gdk/gdk_cand.h b/gdk/gdk_cand.h
--- a/gdk/gdk_cand.h
+++ b/gdk/gdk_cand.h
@@ -177,6 +177,7 @@ gdk_export oid canditer_last(const struc
 gdk_export oid canditer_prev(struct canditer *ci);
 gdk_export oid canditer_peekprev(struct canditer *ci);
 gdk_export oid canditer_idx(const struct canditer *ci, BUN p);
+#define canditer_idx_dense(ci, p) ((p >= (ci)->ncand)?oid_nil:((ci)->seq + p))
 gdk_export void canditer_setidx(struct canditer *ci, BUN p);
 gdk_export void canditer_reset(struct canditer *ci);
 gdk_export BUN canditer_search(const struct canditer *ci, oid o, bool next);
diff --git a/gdk/gdk_project.c b/gdk/gdk_project.c
--- a/gdk/gdk_project.c
+++ b/gdk/gdk_project.c
@@ -20,6 +20,57 @@
  * hseqbase + its batCount.
  */
 
+#define project1_loop(TYPE)\
+static gdk_return  \
+project1_##TYPE(BAT *restrict bn, BAT *restrict l, BAT *restrict r1)   \
+{  \
+   BUN lo, hi; \
+   const TYPE *restrict r1t;   \
+   TYPE *restrict bt;  \
+   oid r1seq, r1end;   \
+   \
+   MT_thread_setalgorithm(__func__);   \
+   r1t = (const TYPE *) Tloc(r1, 0);   \
+   bt = (TYPE *) Tloc(bn, 0);  \
+   r1seq = r1->hseqbase;   \
+   r1end = r1seq + BATcount(r1);   \
+   if (BATtdense(l)) { \
+   if (l->tseqbase < r1seq ||  \
+  (l->tseqbase+BATcount(l)) >= r1end) {\
+   GDKerror("does not match always\n");\
+   return GDK_FAIL;\
+   }   \
+   oid off = l->tseqbase - r1seq;  \
+   r1t += off; \
+   for (lo = 0, hi = BATcount(l); lo < hi; lo++)   \
+   bt[lo] = r1t[lo];   \
+   } else {\
+   const oid *restrict ot = (const oid *) Tloc(l, 0);  \
+   for (lo = 0, hi = BATcount(l); lo < hi; lo++) { \
+   oid o = ot[lo]; \
+   if (o < r1seq || o >= r1end) {  \
+   GDKerror("does not match always\n");\
+   return GDK_FAIL;\
+   }   \
+   bt[lo] = r1t[o - r1seq];\
+   }   \
+   }   \
+   BATsetcount(bn, lo);\
+   return GDK_SUCCEED; \
+}
+
+/* project type switch */
+project1_loop(bte)
+project1_loop(sht)
+project1_loop(int)
+project1_loop(flt)
+project1_loop(dbl)
+project1_loop(lng)
+#ifdef HAVE_HGE
+project1_loop(hge)
+#endif
+project1_loop(uuid)
+
 #define project_loop(TYPE) \
 static gdk_return  \
 project_##TYPE(BAT *restrict bn, BAT *restrict l,  \
@@ -34,6 +85,8 @@ project_##TYPE(BAT *restrict bn, BAT *re
oid r1seq, r1end;   \
oid r2seq, r2end;   \
\
+   if ((!ci || ci->tpe == cand_dense) && l->tnonil && !r2) \
+   return project1_##TYPE(bn, l, r1);  \

MonetDB: string_imprints - Make naming more consistent

2021-04-22 Thread Panagiotis Koutsourakis
Changeset: 5493fe034571 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/5493fe034571
Modified Files:
gdk/gdk_strimps.c
gdk/gdk_strimps.h
monetdb5/modules/mal/batExtensions.c
Branch: string_imprints
Log Message:

Make naming more consistent


diffs (131 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -65,7 +65,7 @@
  * contains.
  */
 static size_t
-GDKstrimp_strlen(const uint8_t *s)
+STRMP_strlen(const uint8_t *s)
 {
size_t ret = 0;
size_t i;
@@ -102,7 +102,7 @@ GDKstrimp_strlen(const uint8_t *s)
  * 1 digram starting at character n - 1
  */
 gdk_return
-GDKstrimp_ndigrams(BAT *b, size_t *n)
+STRMPndigrams(BAT *b, size_t *n)
 {
// lng t0;
BUN i;
@@ -119,7 +119,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
*n = 0;
for (i = 0; i < b->batCount; i++) {
s = (char *)BUNtail(bi, i);
-// *n += GDKstrimp_strlen(s) - 1;
+// *n += STRMP_strlen(s) - 1;
*n += strlen(s) - 1;
// TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, s);
}
@@ -152,7 +152,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
  * count.
  */
 gdk_return
-GDKstrimp_make_histogram(BAT *b, uint64_t *hist, size_t hist_size, size_t 
*nbins)
+STRMPmakehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins)
 {
lng t0=0;
size_t hi;
@@ -269,7 +269,7 @@ create_header(BAT *b)
if ((header = (StrimpHeader*)GDKmalloc(sizeof(StrimpHeader))) == NULL)
return NULL;
 
-   if(GDKstrimp_make_histogram(b, hist, STRIMP_HISTSIZE, ) != 
GDK_SUCCEED) {
+   if(STRMPmakehistogram(b, hist, STRIMP_HISTSIZE, ) != GDK_SUCCEED) 
{
GDKfree(header);
return NULL;
}
@@ -307,7 +307,7 @@ lookup_index(StrimpHeader *h, DataPair n
  * This should probably be inlined.
  */
 static uint64_t
-GDKstrimp_make_bitstring(const str s, StrimpHeader *h)
+STRMPmakebitstring(const str s, StrimpHeader *h)
 {
uint64_t ret = 0;
int8_t pair_idx;
@@ -379,7 +379,7 @@ create_strimp_heap(BAT *b, StrimpHeader 
 
 /* Create */
 gdk_return
-GDKstrimp_create_strimp(BAT *b)
+STRMPcreate(BAT *b)
 {
lng t0 = 0;
BATiter bi;
@@ -401,13 +401,13 @@ GDKstrimp_create_strimp(BAT *b)
GDKfree(head);
return GDK_FAIL;
}
-   dh = (uint64_t *)h->base + h->free;
+   dh = (uint64_t *)h->base + h->free; // That's probably not 
correct
 
bi = bat_iterator(b);
for (i = 0; i < b->batCount; i++) {
s = (str)BUNtvar(bi, i);
if (!strNil(s))
-   *dh++ = GDKstrimp_make_bitstring(s, head);
+   *dh++ = STRMPmakebitstring(s, head);
else
*dh++ = 0; /* no pairs in nil values */
 
diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h
--- a/gdk/gdk_strimps.h
+++ b/gdk/gdk_strimps.h
@@ -27,9 +27,10 @@ typedef struct {
DataPair bytepairs[STRIMP_HEADER_SIZE];
 } StrimpHeader;
 
-gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n); // Remove?
-gdk_export gdk_return GDKstrimp_make_histogram(BAT *b, uint64_t *hist, size_t 
hist_size, size_t *nbins); // make static
-// gdk_export gdk_return GDKstrimp_make_header(StrimpHeader *h, uint64_t 
*hist, size_t hist_size); // make static
-//gdk_export gdk_return GDKstrimp_make_header(BAT *b);
-gdk_export gdk_return GDKstrimp_create_strimp(BAT *b);
+gdk_export gdk_return STRMPndigrams(BAT *b, size_t *n); // Remove?
+gdk_export gdk_return STRMPmakehistogram(BAT *b, uint64_t *hist, size_t 
hist_size, size_t *nbins); // make static
+// gdk_export gdk_return STRMP_make_header(StrimpHeader *h, uint64_t *hist, 
size_t hist_size); // make static
+//gdk_export gdk_return STRMP_make_header(BAT *b);
+gdk_export gdk_return STRMPcreate(BAT *b);
+gdk_export BAT *STRMPfilter(BAT *b, char *q);
 #endif /* _GDK_STRIMPS_H_ */
diff --git a/monetdb5/modules/mal/batExtensions.c 
b/monetdb5/modules/mal/batExtensions.c
--- a/monetdb5/modules/mal/batExtensions.c
+++ b/monetdb5/modules/mal/batExtensions.c
@@ -284,7 +284,7 @@ PATstrimp_ndigrams(Client cntxt, MalBlkP
if ((b = BATdescriptor(bid)) == NULL)
throw(MAL, "bat.strimpDigrams", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
 
-   if (!GDKstrimp_ndigrams(b, )) {
+   if (!STRMPndigrams(b, )) {
throw(MAL, "bat.strimpDigrams", SQLSTATE(HY002) 
OPERATION_FAILED);
}
 
@@ -309,7 +309,7 @@ PATstrimp_makehist(Client cntxt, MalBlkP
if ((b = BATdescriptor(bid)) == NULL)
throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
 
-   if (!GDKstrimp_make_histogram(b, hist, STRIMP_HISTSIZE, )) {
+   if (!STRMPmakehistogram(b, hist, STRIMP_HISTSIZE, )) {

MonetDB: string_imprints - Read persistent strimp [WIP]

2021-04-22 Thread Panagiotis Koutsourakis
Changeset: fa263cc6a470 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/fa263cc6a470
Modified Files:
gdk/gdk.h
gdk/gdk_private.h
gdk/gdk_strimps.c
gdk/gdk_strimps.h
monetdb5/modules/mal/batExtensions.c
Branch: string_imprints
Log Message:

Read persistent strimp [WIP]


diffs (truncated from 361 to 300 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -568,6 +568,7 @@ typedef struct {
 
 typedef struct Hash Hash;
 typedef struct Imprints Imprints;
+typedef struct Strimps Strimps;
 
 /*
  * @+ Binary Association Tables
@@ -732,7 +733,7 @@ typedef struct {
Hash *hash; /* hash table */
Imprints *imprints; /* column imprints index */
Heap *orderidx; /* order oid index */
-   Heap *strimps;  /* string imprint index  */
+   Strimps *strimps;   /* string imprint index  */
 
PROPrec *props; /* list of dynamic properties stored in the bat 
descriptor */
 } COLrec;
diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -18,6 +18,9 @@
 /* persist order index heaps for persistent BATs */
 #define PERSISTENTIDX 1
 
+/* persist strimp heaps for persistent BATs */
+#define PERSISTENTSTRIMP 1
+
 #include "gdk_system_private.h"
 
 enum heaptype {
@@ -384,6 +387,15 @@ struct Imprints {
BUN dictcnt;/* counter for cache dictionary   */
 };
 
+struct Strimps {
+   Heap strimps;
+   void *offsets_base; /* pointer into strimps heap (pair offsets)  */
+   /* offsets_base is a pointer to either a uint8_t or a uint16_t */
+   uint8_t *pairs_base;/* pointer into strimps heap (pairs start)   */
+   void *strimps_base; /* pointer into strimps heap (strimps start) */
+   /* strimps_base is a pointer to either a uint32_t or a uint64_t */
+};
+
 typedef struct {
MT_Lock swap;
 } batlock_t;
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -91,7 +91,6 @@ STRMP_strlen(const uint8_t *s)
 
return ret;
 }
-#endif
 
 /* Given a BAT return the number of digrams in it. The observation is
  * that the number of digrams is the number of characters - 1:
@@ -129,6 +128,7 @@ STRMPndigrams(BAT *b, size_t *n)
 
return GDK_SUCCEED;
 }
+#endif
 
 /* The isIgnored is a bit suspect in terms of unicode. There are
  * non-ASCII codepoints that are considered spaces, for example the
@@ -151,8 +151,8 @@ STRMPndigrams(BAT *b, size_t *n)
  * Return the histogram in hist and the number of non-zero bins in
  * count.
  */
-gdk_return
-STRMPmakehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins)
+static gdk_return
+STRMPmakehistogramBP(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins)
 {
lng t0=0;
size_t hi;
@@ -269,7 +269,7 @@ create_header(BAT *b)
if ((header = (StrimpHeader*)GDKmalloc(sizeof(StrimpHeader))) == NULL)
return NULL;
 
-   if(STRMPmakehistogram(b, hist, STRIMP_HISTSIZE, ) != GDK_SUCCEED) 
{
+   if(STRMPmakehistogramBP(b, hist, STRIMP_HISTSIZE, ) != 
GDK_SUCCEED) {
GDKfree(header);
return NULL;
}
@@ -324,11 +324,11 @@ STRMPmakebitstring(const str s, StrimpHe
 }
 
 /* Create the heap for a string imprint. Returns NULL on failure. */
-static Heap *
-create_strimp_heap(BAT *b, StrimpHeader *h)
+static Strimps *
+create_strimp(BAT *b, StrimpHeader *h)
 {
-   Heap *r = NULL;
uint64_t *d;
+   Strimps *r = NULL;
uint64_t descriptor;
uint64_t npairs, bytes_per_pair, hsize;
size_t i;
@@ -336,15 +336,15 @@ create_strimp_heap(BAT *b, StrimpHeader 
const char *nme;
 
nme = GDKinmemory(b->theap->farmid) ? ":memory:" : 
BBP_physical(b->batCacheid);
-   if ((r = GDKzalloc(sizeof(Heap))) == NULL ||
-   (r->farmid = BBPselectfarm(b->batRole, b->ttype, strimpheap)) < 0 ||
-   strconcat_len(r->filename, sizeof(r->filename),
- nme, ".strimp", NULL) >= sizeof(r->filename) ||
-   HEAPalloc(r, BATcount(b) + STRIMP_OFFSET, sizeof(uint64_t), 0) != 
GDK_SUCCEED) {
+   if ((r = GDKzalloc(sizeof(Strimps))) == NULL ||
+   (r->strimps.farmid = BBPselectfarm(b->batRole, b->ttype, 
strimpheap)) < 0 ||
+   strconcat_len(r->strimps.filename, sizeof(r->strimps.filename),
+ nme, ".strimp", NULL) >= sizeof(r->strimps.filename) 
||
+   HEAPalloc(>strimps, BATcount(b) + STRIMP_OFFSET, 
sizeof(uint64_t), 0) != GDK_SUCCEED) {
GDKfree(r);
return NULL;
}
-   r->free = STRIMP_OFFSET * sizeof(uint64_t);
+   r->strimps.free = STRIMP_OFFSET * sizeof(uint64_t);
 
npairs = STRIMP_HEADER_SIZE;
bytes_per_pair = 2; /* Bytepair implementation */
@@ -353,9 +353,9 @@ create_strimp_heap(BAT *b, StrimpHeader 

MonetDB: string_imprints - Update comment

2021-04-06 Thread Panagiotis Koutsourakis
Changeset: 0cc344ae7097 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/0cc344ae7097
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Update comment


diffs (45 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -16,17 +16,33 @@
  * - a 64 bit mask for each item in the BAT that encodes the presence or
  *   absence of each element of the header in the specific item.
  *
- * A string imprint is stored in a new Heap in the BAT.
+ * A string imprint is stored in a new Heap in the BAT, aligned in 8
+ * byte (64 bit) words.
  *
- * In the current (byte pair) implementation the first 136 bytes
- * (i.e. the first 17 64 bit quantities) in the Heap are as follows:
+ * The first 64 bit word describes how the header of the strimp is
+ * encoded. The most significant byte (v in the schematic below) is the
+ * version number. The second (np) is the number of pairs in the
+ * header. The third (b/p) is the number of bytes per pair if each pair
+ * is encoded using a constant number of bytes or 0 if it is utf-8. The
+ * next 2 bytes (hs) is the size of the header in bytes. The last 3
+ * bytes needed to align to the 8 byte boundary should be zero, and are
+ * reserved for future use.
+ *
+ * In the current implementation we use 64 byte pairs for the header, so
  *
- * |   Version Number  |   -
- * | byte pair 01 | byte pair 02 | byte pair 03 | byte pair 04 | |
- * | byte pair 05 | byte pair 06 | byte pair 07 | byte pair 08 | |  17 64 
bit quantities
- * [...] |
- * | byte pair 61 | byte pair 62 | byte pair 63 | byte pair 64 |   -
+ * np  == 64
+ * b/p == 2
+ * hs  == 128
+ *
+ * The actual header follows. If it ends before an 8 byte boundary it
+ * is padded with zeros.
  *
+ * |  v   |  np   |  b/p |  hs  | reserved |  8 bytes
+ * |                                       |  ---
+ * |             Strimp Header             |   |
+ * |                                       |  hs bytes + padding
+ * |                                       |   |
+ * |                                       |  ---
  * The bitmasks for each string in the BAT follow after this.
  *
  * Strimp creation goes as follows:
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Fix bitmask construction

2021-04-06 Thread Panagiotis Koutsourakis
Changeset: 14266938fcad for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/14266938fcad
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Fix bitmask construction

The current bytepair in the string might not exist in the header.


diffs (52 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -60,10 +60,10 @@
 #include "gdk.h"
 #include "gdk_private.h"
 
+#if 0
 /* This counts how many unicode codepoints the given string
  * contains.
  */
-#if 0
 static size_t
 GDKstrimp_strlen(const uint8_t *s)
 {
@@ -251,7 +251,7 @@ make_header(StrimpHeader *h, uint64_t* h
}
 
for(i = 0; i < STRIMP_HEADER_SIZE; i++) {
-   TRC_DEBUG(ALGO, "%u %u: %lu", indexToPair1(h->bytepairs[i]), 
indexToPair2(h->bytepairs[i]), max_counts[i]);
+   TRC_DEBUG(ALGO, "0x%x 0x%x: %lu", 
indexToPair1(h->bytepairs[i]), indexToPair2(h->bytepairs[i]), max_counts[i]);
}
 
TRC_DEBUG(ALGO, LLFMT " usec\n", GDKusec() - t0);
@@ -283,7 +283,7 @@ create_header(BAT *b)
  *
  * h[i] == p
  *
- * Returns 0 if p is not in h.
+ * Returns -1 if p is not in h.
  *
  * TODO: Should this be inlined somehow? (probably yes)
  */
@@ -295,7 +295,7 @@ lookup_index(StrimpHeader *h, DataPair n
if(h->bytepairs[i] == n)
return i;
 
-   return 0;
+   return -1;
 }
 
 
@@ -314,7 +314,8 @@ GDKstrimp_make_bitstring(const str s, St
 
for(it = s; *it != 0 && *(it+1) != 0; it++) {
pair_idx = lookup_index(h, pairToIndex(*it, *(it+1)));
-   ret |= 0x1 << pair_idx;
+   if (pair_idx >= 0)
+   ret |= 0x1 << pair_idx;
}
 
return ret;
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Construct strimp descriptor correctly

2021-04-06 Thread Panagiotis Koutsourakis
Changeset: f46a719af133 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/f46a719af133
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Construct strimp descriptor correctly


diffs (92 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -20,7 +20,7 @@
  * byte (64 bit) words.
  *
  * The first 64 bit word describes how the header of the strimp is
- * encoded. The most significant byte (v in the schematic below) is the
+ * encoded. The least significant byte (v in the schematic below) is the
  * version number. The second (np) is the number of pairs in the
  * header. The third (b/p) is the number of bytes per pair if each pair
  * is encoded using a constant number of bytes or 0 if it is utf-8. The
@@ -205,22 +205,23 @@ GDKstrimp_make_histogram(BAT *b, uint64_
return GDK_SUCCEED;
 }
 
-/* Given a histogram find the indices of the 64 largest counts.
+/* Given a histogram find the indices of the STRIMP_HEADER_SIZE largest
+ * counts.
  *
  * We make one scan of histogram and every time we find a count that is
- * greater than the current minimum of the 64, we bubble it up in the
- * header until we find a count that is greater. We carry the index in
- * the histogram because this is the information we are actually
- * interested in keeping.
+ * greater than the current minimum of the STRIMP_HEADER_SIZE, we bubble
+ * it up in the header until we find a count that is greater. We carry
+ * the index in the histogram because this is the information we are
+ * actually interested in keeping.
  *
- * At the end of this process we have the indices of 64 largest counts
- * in the histogram. This process is O(n) in time since we are doing
- * constant work (at most 63 comparisons and swaps) for each item in the
- * histogram and as such is (theoretically) more efficient than sorting
- * (O(nlog n))and taking the 64 largest elements. This depends on the
- * size of the histogram n. For some small n sorting might be more
- * efficient, but for such inputs the difference should not be
- * noticeable.
+ * At the end of this process we have the indices of STRIMP_HEADER_SIZE
+ * largest counts in the histogram. This process is O(n) in time since
+ * we are doing constant work (at most 63 comparisons and swaps) for
+ * each item in the histogram and as such is (theoretically) more
+ * efficient than sorting (O(n log n)) and taking the STRIMP_HEADER_SIZE
+ * largest elements. This depends on the size of the histogram n. For
+ * some small n sorting might be more efficient, but for such inputs the
+ * difference should not be noticeable.
  *
  * In the current implementation each index is a DataPair value that is
  * constructed by pairToIndex from 2 consecutive bytes in the input.
@@ -328,6 +329,9 @@ create_strimp_heap(BAT *b, StrimpHeader 
 {
Heap *r = NULL;
uint64_t *d;
+   uint64_t descriptor;
+   uint8_t npairs, bytes_per_pair;
+   uint16_t hsize;
size_t i,j;
const char *nme;
 
@@ -342,7 +346,17 @@ create_strimp_heap(BAT *b, StrimpHeader 
}
r->free = STRIMP_OFFSET * sizeof(uint64_t);
 
+   npairs = STRIMP_HEADER_SIZE;
+   bytes_per_pair = 2; /* Bytepair implementation */
+   hsize = sizeof(h->bytepairs);
+
+   assert(bytes_per_pair == 0 || npairs*bytes_per_pair == hsize);
+
+   descriptor = 0;
+   descriptor =  STRIMP_VERSION | npairs << 8 | bytes_per_pair << 16 | 
hsize << 24;
+
d = (uint64_t *)r->base;
+   *d++ = descriptor;
/* This loop assumes that we are working with byte pairs
 * (i.e. the type of the header is uint16_t). TODO: generalize.
 */
@@ -352,7 +366,14 @@ create_strimp_heap(BAT *b, StrimpHeader 
*d <<= 16;
*d |= h->bytepairs[i + j];
}
+   d++;
}
+#ifndef NDEBUG
+   FILE *fp = fopen("/tmp/foo.strimp", "wb");
+   fwrite(r->base, sizeof(uint64_t), STRIMP_HEADER_SIZE/4 + 1, fp);
+   fclose(fp);
+#endif
+
return r;
 }
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Take into account negative numbers

2021-04-06 Thread Panagiotis Koutsourakis
Changeset: d74bcfb2b926 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/d74bcfb2b926
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Take into account negative numbers


diffs (45 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -287,7 +287,7 @@ create_header(BAT *b)
  *
  * TODO: Should this be inlined somehow? (probably yes)
  */
-static uint8_t
+static int8_t
 lookup_index(StrimpHeader *h, DataPair n)
 {
size_t i;
@@ -309,12 +309,13 @@ static uint64_t
 GDKstrimp_make_bitstring(const str s, StrimpHeader *h)
 {
uint64_t ret = 0;
-   uint8_t pair_idx;
+   int8_t pair_idx;
char *it;
 
for(it = s; *it != 0 && *(it+1) != 0; it++) {
pair_idx = lookup_index(h, pairToIndex(*it, *(it+1)));
if (pair_idx >= 0)
+   assert(pair_idx < STRIMP_HEADER_SIZE);
ret |= 0x1 << pair_idx;
}
 
@@ -323,7 +324,7 @@ GDKstrimp_make_bitstring(const str s, St
 
 /* Create the heap for a string imprint. Returns NULL on failure. */
 static Heap *
-createStrimpheap(BAT *b, StrimpHeader *h)
+create_strimp_heap(BAT *b, StrimpHeader *h)
 {
Heap *r = NULL;
uint64_t *d;
@@ -374,7 +375,7 @@ GDKstrimp_create_strimp(BAT *b)
return GDK_FAIL;
}
 
-   if ((h = createStrimpheap(b, head)) == NULL) {
+   if ((h = create_strimp_heap(b, head)) == NULL) {
GDKfree(head);
return GDK_FAIL;
}
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Merge branch 'master' into branches/s...

2021-02-23 Thread Panagiotis Koutsourakis
Changeset: 3a4196c618de for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3a4196c618de
Branch: string_imprints
Log Message:

Merge branch 'master' into branches/string_imprints


diffs (truncated from 4430 to 300 lines):

diff --git a/clients/Tests/MAL-signatures.stable.out 
b/clients/Tests/MAL-signatures.stable.out
--- a/clients/Tests/MAL-signatures.stable.out
+++ b/clients/Tests/MAL-signatures.stable.out
@@ -777,7 +777,8 @@ stdout of test 'MAL-signatures` in direc
 [ "batalgebra","not_like", "pattern 
batalgebra.not_like(X_1:bat[:str], X_2:bat[:str], X_3:str):bat[:bit] ",
"BATPCREnotlike;",  ""  ]
 [ "batalgebra","not_like", "pattern 
batalgebra.not_like(X_1:bat[:str], X_2:str, X_3:str):bat[:bit] ",  
"BATPCREnotlike;",  ""  ]
 [ "batalgebra","not_like", "pattern batalgebra.not_like(X_1:str, 
X_2:bat[:str], X_3:str):bat[:bit] ",  "BATPCREnotlike;",  ""  ]
-[ "batblob",   "nitems",   "command 
batblob.nitems(X_1:bat[:blob]):bat[:int] ","BLOBnitems_bulk;", ""  ]
+[ "batblob",   "nitems",   "pattern 
batblob.nitems(X_1:bat[:blob]):bat[:int] ","BLOBnitems_bulk;", ""  ]
+[ "batblob",   "nitems",   "pattern batblob.nitems(X_1:bat[:blob], 
X_2:bat[:oid]):bat[:int] ", "BLOBnitems_bulk;", ""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_1:any_1, 
X_2:bat[:any_1]):bat[:bit] ","CMDbatNE;",""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_1:any_1, X_2:bat[:any_1], 
X_3:bat[:oid]):bat[:bit] ", "CMDbatNE;",""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_1:any_1, X_2:bat[:any_1], 
X_3:bat[:oid], X_4:bit):bat[:bit] ","CMDbatNE;",""  ]
@@ -8010,7 +8011,6 @@ stdout of test 'MAL-signatures` in direc
 [ "calc",  "iszero",   "pattern calc.iszero(X_1:int):bit ",
"CMDvarISZERO;",""  ]
 [ "calc",  "iszero",   "pattern calc.iszero(X_1:lng):bit ",
"CMDvarISZERO;",""  ]
 [ "calc",  "iszero",   "pattern calc.iszero(X_1:sht):bit ",
"CMDvarISZERO;",""  ]
-[ "calc",  "json", "command calc.json(X_1:json):json ",
"JSONstr2json;",""  ]
 [ "calc",  "json", "command calc.json(X_1:str):json ", 
"JSONstr2json;",""  ]
 [ "calc",  "length",   "command calc.length(X_1:str):int ",
"CMDstrlength;",""  ]
 [ "calc",  "lng",  "pattern calc.lng(X_1:bit):lng ",   
"CMDvarCONVERT;",   ""  ]
@@ -8814,10 +8814,7 @@ stdout of test 'MAL-signatures` in direc
 [ "json",  "fold", "pattern json.fold(X_1:bat[:str], X_2:bat[:any]):json 
","JSONfold;",""  ]
 [ "json",  "integer",  "command json.integer(X_1:json):lng ",  
"JSONjson2integer;",""  ]
 [ "json",  "isarray",  "command json.isarray(X_1:json):bit ",  
"JSONisarray;", ""  ]
-[ "json",  "isarray",  "command json.isarray(X_1:str):bit ",   
"JSONisarray;", ""  ]
 [ "json",  "isobject", "command json.isobject(X_1:json):bit ", 
"JSONisobject;",""  ]
-[ "json",  "isobject", "command json.isobject(X_1:str):bit ",  
"JSONisobject;",""  ]
-[ "json",  "isvalid",  "command json.isvalid(X_1:json):bit ",  
"JSONisvalid;", ""  ]
 [ "json",  "isvalid",  "command json.isvalid(X_1:str):bit ",   
"JSONisvalid;", ""  ]
 [ "json",  "keyarray", "command json.keyarray(X_1:json):json ",
"JSONkeyArray;",""  ]
 [ "json",  "keys", "command json.keys(X_1:json):bat[:str] ",   
"JSONkeyTable;",""  ]
diff --git a/clients/Tests/MAL-signatures.stable.out.int128 
b/clients/Tests/MAL-signatures.stable.out.int128
--- a/clients/Tests/MAL-signatures.stable.out.int128
+++ b/clients/Tests/MAL-signatures.stable.out.int128
@@ -892,7 +892,8 @@ stdout of test 'MAL-signatures` in direc
 [ "batalgebra","not_like", "pattern 
batalgebra.not_like(X_1:bat[:str], X_2:bat[:str], X_3:str):bat[:bit] ",
"BATPCREnotlike;",  ""  ]
 [ "batalgebra","not_like", "pattern 
batalgebra.not_like(X_1:bat[:str], X_2:str, X_3:str):bat[:bit] ",  
"BATPCREnotlike;",  ""  ]
 [ "batalgebra","not_like", "pattern batalgebra.not_like(X_1:str, 
X_2:bat[:str], X_3:str):bat[:bit] ",  "BATPCREnotlike;",  ""  ]
-[ "batblob",   "nitems",   "command 
batblob.nitems(X_1:bat[:blob]):bat[:int] ","BLOBnitems_bulk;", ""  ]
+[ "batblob",   "nitems",   "pattern 
batblob.nitems(X_1:bat[:blob]):bat[:int] ","BLOBnitems_bulk;", ""  ]
+[ "batblob",   "nitems",   "pattern batblob.nitems(X_1:bat[:blob], 
X_2:bat[:oid]):bat[:int] ", "BLOBnitems_bulk;", ""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_1:any_1, 
X_2:bat[:any_1]):bat[:bit] ","CMDbatNE;",""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_1:any_1, X_2:bat[:any_1], 
X_3:bat[:oid]):bat[:bit] ",

MonetDB: string_imprints - Add documentation and move things arr...

2021-03-04 Thread Panagiotis Koutsourakis
Changeset: e695149dd3ce for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e695149dd3ce
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Add documentation and move things arround


diffs (115 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -94,48 +94,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
((TPE *) _a)[_j] = _t;  \
} while(0)
 
-static StrimpHeader *
-make_header(StrimpHeader *h, uint64_t* hist, size_t hist_size)
-{
-   lng t0 = 0;
-   size_t i;
-   uint64_t max_counts[STRIMP_SIZE] = {0};
-   const size_t cmin_max = STRIMP_SIZE - 1;
-   size_t hidx;
-
-   TRC_DEBUG_IF(ALGO) t0 = GDKusec();
-
-   for(i = 0; i < STRIMP_SIZE; i++)
-   h->bytepairs[i] = 0;
-
-   for(i = 0; i < hist_size; i++) {
-   if (max_counts[cmin_max] < hist[i]) {
-   max_counts[cmin_max] = hist[i];
-   h->bytepairs[cmin_max] = i;
-for(hidx = cmin_max; hidx > 0 && max_counts[hidx] > 
max_counts[hidx-1]; hidx--) {
-   swp(max_counts, hidx, hidx-1, uint64_t);
-   swp(h->bytepairs, hidx, hidx-1, uint16_t);
-   }
-   }
-   }
-
-   for(i = 0; i < STRIMP_SIZE; i++) {
-   TRC_DEBUG(ALGO, "%u %u: %lu", indexToPair1(h->bytepairs[i]), 
indexToPair2(h->bytepairs[i]), max_counts[i]);
-   }
-
-   TRC_DEBUG_ENDIF(ALGO, LLFMT "usec\n", GDKusec() - t0);
-
-   return h;
-}
-
-
-/* static uint64_t */
-/* add_to_header(size_t idx, uint64_t count) */
-/* { */
-/* while */
-/* return GDK_SUCCEED; */
-/* } */
-/* Construct a histogram of pairs of bytes.
+/* Construct a histogram of pairs of bytes in the input BAT.
  *
  * Return the histogram in hist and the number of non-zero bins in
  * count.
@@ -194,8 +153,59 @@ GDKstrimp_make_histogram(BAT *b, uint64_
return GDK_SUCCEED;
 }
 
-gdk_return
-GDKstrimp_make_header(BAT *b)
+/* Given a histogram find the indices of the 64 largest counts.
+ *
+ * We make one scan of histogram and every time we find a count that is
+ * greater than the current minimum of the 64, we bubble it up in the
+ * header until we find a count that is greater. We carry the index in
+ * the histogram because this is the information we are actually
+ * interested in keeping.
+ *
+ * At the end of this process we have the indices of 64 largest counts
+ * in the histogram. This process is O(n) in time since we are doing
+ * constant work (at most 63 comparisons and swaps) for each item in the
+ * histogram and as such is (theoretically) more efficient than sorting
+ * (O(nlog n))and taking the 64 largest elements. This depends on the
+ * size of the histogram n. For some small n sorting might be more
+ * efficient, but for such inputs the difference should not be
+ * noticeable.
+ *
+ * In the current implementation each index is a DataPair value that is
+ * constructed by pairToIndex from 2 consecutive bytes in the input.
+ */
+static StrimpHeader *
+make_header(StrimpHeader *h, uint64_t* hist, size_t hist_size)
+{
+   lng t0 = 0;
+   size_t i;
+   uint64_t max_counts[STRIMP_HEADER_SIZE] = {0};
+   const size_t cmin_max = STRIMP_HEADER_SIZE - 1;
+   size_t hidx;
+
+   TRC_DEBUG_IF(ALGO) t0 = GDKusec();
+
+   for(i = 0; i < STRIMP_HEADER_SIZE; i++)
+   h->bytepairs[i] = 0;
+
+   for(i = 0; i < hist_size; i++) {
+   if (max_counts[cmin_max] < hist[i]) {
+   max_counts[cmin_max] = hist[i];
+   h->bytepairs[cmin_max] = i;
+for(hidx = cmin_max; hidx > 0 && max_counts[hidx] > 
max_counts[hidx-1]; hidx--) {
+   swp(max_counts, hidx, hidx-1, uint64_t);
+   swp(h->bytepairs, hidx, hidx-1, DataPair);
+   }
+   }
+   }
+
+   for(i = 0; i < STRIMP_HEADER_SIZE; i++) {
+   TRC_DEBUG(ALGO, "%u %u: %lu", indexToPair1(h->bytepairs[i]), 
indexToPair2(h->bytepairs[i]), max_counts[i]);
+   }
+
+   TRC_DEBUG(ALGO, LLFMT " usec\n", GDKusec() - t0);
+
+   return h;
+}
 {
uint64_t hist[STRIMP_HISTSIZE] = {0};
size_t nbins = 0;
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Some utility functions

2021-03-04 Thread Panagiotis Koutsourakis
Changeset: a13846692aaa for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=a13846692aaa
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Some utility functions

- lookup the index of a pair in the header
- construct a bitstring for a given string encoding the presence or
  absence of the pairs in the header

These should probably be inlined.


diffs (68 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -227,33 +227,43 @@ create_header(BAT *b)
 }
 
 
-/* static uint8_t */
-/* lookup_index(StrimpHeader *h, uint16_t n) */
-/* { */
-/* size_t i; */
-/* for(i = 0; i < STRIMP_SIZE; i++) */
-/* if(h->bytepairs[i] == n) */
-/* return i; */
+/* Given a strimp h and a DataPair p, return the index i for which
+ *
+ * h[i] == p
+ *
+ * Returns 0 if p is not in h.
+ *
+ * TODO: Should this be inlined somehow? (probably yes)
+ */
+static uint8_t
+lookup_index(StrimpHeader *h, DataPair n)
+{
+   size_t i;
+   for(i = 0; i < STRIMP_HEADER_SIZE; i++)
+   if(h->bytepairs[i] == n)
+   return i;
 
-/* return 0; */
-/* } */
+   return 0;
+}
 
 
 /* Given a strimp header and a string compute the bitstring of which
  * digrams(byte pairs) are present in the string. The strimp header is a
  * map from digram(byte pair) to index in the strimp.
+ *
+ * This should probably be inlined.
  */
-/* static uint64_t */
-/* GDKstrimp_make_bitstring(str s, StrimpHeader *h) */
-/* { */
-/* uint64_t ret = 0; */
-/* uint8_t pair_idx; */
-/* char *it; */
+static uint64_t
+GDKstrimp_make_bitstring(const str s, StrimpHeader *h)
+{
+   uint64_t ret = 0;
+   uint8_t pair_idx;
+   char *it;
 
-/* for(it = s; *it != 0 && *(it+1) != 0; it++) { */
-/* pair_idx = lookup_index(h, pairToIndex(*it, *(it+1))); */
-/* ret |= 0x1 << pair_idx; */
-/* } */
+   for(it = s; *it != 0 && *(it+1) != 0; it++) {
+   pair_idx = lookup_index(h, pairToIndex(*it, *(it+1)));
+   ret |= 0x1 << pair_idx;
+   }
 
-/* return ret; */
-/* } */
+   return ret;
+}
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Expose strimp construction to MAL

2021-03-04 Thread Panagiotis Koutsourakis
Changeset: 532b3fb7b9ff for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=532b3fb7b9ff
Modified Files:
monetdb5/modules/mal/batExtensions.c
Branch: string_imprints
Log Message:

Expose strimp construction to MAL


diffs (33 lines):

diff --git a/monetdb5/modules/mal/batExtensions.c 
b/monetdb5/modules/mal/batExtensions.c
--- a/monetdb5/modules/mal/batExtensions.c
+++ b/monetdb5/modules/mal/batExtensions.c
@@ -402,7 +402,7 @@ PATstrimp_makehist(Client cntxt, MalBlkP
 }
 
 static str
-PATstrimp_makeheader(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+PATstrimp(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
 {
bat bid;
BAT *b;
@@ -413,9 +413,10 @@ PATstrimp_makeheader(Client cntxt, MalBl
if ((b = BATdescriptor(bid)) == NULL)
throw(MAL, "bat.strimpHeader", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
 
-   if(GDKstrimp_make_header(b) != GDK_SUCCEED)
+   if(GDKstrimp_create_strimp(b) != GDK_SUCCEED)
throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) 
OPERATION_FAILED);
 
+   // *getArgReference_lng(stk, pci, 0) = 0;
return MAL_SUCCEED;
 }
 
@@ -452,7 +453,7 @@ mel_func batExtensions_init_funcs[] = {
  /* String imprints */
  pattern("bat", "strimpNDigrams", PATstrimp_ndigrams, false, "count digrams in 
a string bat", args(1,2,arg("",lng),batarg("b",str))),
  pattern("bat", "strimpHistogram", PATstrimp_makehist, false, "make a 
histogram of all the byte pairs in a BAT", args(2,3,arg("",lng), 
batarg("",lng),batarg("b",str))),
- pattern("bat", "strimpHeader", PATstrimp_makeheader, false, "construct the 
strimp header from a BAT", args(1,2,arg("",void),batarg("b",str))),
+ pattern("bat", "strimp", PATstrimp, false, "construct the strimp a BAT", 
args(1,2,arg("",void),batarg("b",str))),
  { .imp=NULL }
 };
 #include "mal_import.h"
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Add a wrapper that allocates space fo...

2021-03-04 Thread Panagiotis Koutsourakis
Changeset: e09bb9a38502 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e09bb9a38502
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Add a wrapper that allocates space for the header


diffs (32 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -206,17 +206,24 @@ make_header(StrimpHeader *h, uint64_t* h
 
return h;
 }
+
+static StrimpHeader *
+create_header(BAT *b)
 {
uint64_t hist[STRIMP_HISTSIZE] = {0};
size_t nbins = 0;
-   StrimpHeader header;
+   StrimpHeader *header;
+   if ((header = (StrimpHeader*)GDKmalloc(sizeof(StrimpHeader))) == NULL)
+   return NULL;
+
if(GDKstrimp_make_histogram(b, hist, STRIMP_HISTSIZE, &nbins) != 
GDK_SUCCEED) {
-   return GDK_FAIL;
+   GDKfree(header);
+   return NULL;
}
 
-   make_header(&header, hist, STRIMP_HISTSIZE);
+   make_header(header, hist, STRIMP_HISTSIZE);
 
-   return GDK_SUCCEED;
+   return header;
 }
 
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Add some documentation

2021-03-04 Thread Panagiotis Koutsourakis
Changeset: d0711db453cd for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d0711db453cd
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Add some documentation


diffs (105 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -6,6 +6,40 @@
  * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
  */
 
+
+/* A string imprint is an index that can be used as a prefilter in LIKE
+ * queries. It has 2 components:
+ *
+ * - a header of 64 string element pairs (bytes in the current
+ *   implementation but maybe unicode chars might make more sense).
+ *
+ * - a 64 bit mask for each item in the BAT that encodes the presence or
+ *   absence of each element of the header in the specific item.
+ *
+ * A string imprint is stored in a new Heap in the BAT.
+ *
+ * In the current (byte pair) implementation the first 136 bytes
+ * (i.e. the first 17 64 bit quantities) in the Heap are as follows:
+ *
+ * |   Version Number  |   -
+ * | byte pair 01 | byte pair 02 | byte pair 03 | byte pair 04 | |
+ * | byte pair 05 | byte pair 06 | byte pair 07 | byte pair 08 | |  17 64 
bit quantities
+ * [...] |
+ * | byte pair 61 | byte pair 62 | byte pair 63 | byte pair 64 |   -
+ *
+ * The bitmasks for each string in the BAT follow after this.
+ *
+ * Strimp creation goes as follows:
+ *
+ * - Construct a histogram of the element (byte or character) pairs for
+ *   all the strings in the BAT.
+ *
+ * - Take the 64 most frequent pairs as the Strimp Header.
+ *
+ * - For each string in the bat construct a 64 bit mask that encodes the
+ *   presence or absence of each member of the header in the string.
+ */
+
 #include "monetdb_config.h"
 #include "gdk.h"
 #include "gdk_private.h"
@@ -13,33 +47,35 @@
 /* This counts how many unicode codepoints the given string
  * contains.
  */
-/* static size_t */
-/* GDKstrimp_strlen(const uint8_t *s) */
-/* { */
-/* size_t ret = 0; */
-/* size_t i; */
-/* int m,n; */
-/* uint8_t c; */
+#if 0
+static size_t
+GDKstrimp_strlen(const uint8_t *s)
+{
+   size_t ret = 0;
+   size_t i;
+   int m,n;
+   uint8_t c;
 
-/* i = 0; */
-/* while((c = *(s + i)) != 0) { */
-/* if (c < 0x80) */
-/* i++; */
-/* else { */
-/* for (n = 0, m=0x40; c & m; n++, m >>= 1) */
-/* ; */
-/* /\* n is now the number of 10xx bytes that should */
-/*follow. *\/ */
-/* if (n == 0 || n >= 4) */
-/* /\* TODO: handle invalid utf-8 *\/ */
-/* {} */
-/* i += n+1; */
-/* } */
-/* ret++; */
-/* } */
+   i = 0;
+   while((c = *(s + i)) != 0) {
+   if (c < 0x80)
+   i++;
+   else {
+   for (n = 0, m=0x40; c & m; n++, m >>= 1)
+   ;
+   /* n is now the number of 10xx bytes that should
+  follow. */
+   if (n == 0 || n >= 4)
+   /* TODO: handle invalid utf-8 */
+   {}
+   i += n+1;
+   }
+   ret++;
+   }
 
-/* return ret; */
-/* } */
+   return ret;
+}
+#endif
 
 /* Given a BAT return the number of digrams in it. The observation is
  * that the number of digrams is the number of characters - 1:
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Count byte pairs instead of unicode c...

2021-03-04 Thread Panagiotis Koutsourakis
Changeset: fbcd6ce89476 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=fbcd6ce89476
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Count byte pairs instead of unicode character pairs


diffs (83 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -13,33 +13,33 @@
 /* This counts how many unicode codepoints the given string
  * contains.
  */
-static size_t
-GDKstrimp_strlen(const uint8_t *s)
-{
-   size_t ret = 0;
-   size_t i;
-   int m,n;
-   uint8_t c;
+/* static size_t */
+/* GDKstrimp_strlen(const uint8_t *s) */
+/* { */
+/* size_t ret = 0; */
+/* size_t i; */
+/* int m,n; */
+/* uint8_t c; */
 
-   i = 0;
-   while((c = *(s + i)) != 0) {
-   if (c < 0x80)
-   i++;
-   else {
-   for (n = 0, m=0x40; c & m; n++, m >>= 1)
-   ;
-   /* n is now the number of 10xx bytes that should
-  follow. */
-   if (n == 0 || n >= 4)
-   /* TODO: handle invalid utf-8 */
-   {}
-   i += n+1;
-   }
-   ret++;
-   }
+/* i = 0; */
+/* while((c = *(s + i)) != 0) { */
+/* if (c < 0x80) */
+/* i++; */
+/* else { */
+/* for (n = 0, m=0x40; c & m; n++, m >>= 1) */
+/* ; */
+/* /\* n is now the number of 10xx bytes that should */
+/*follow. *\/ */
+/* if (n == 0 || n >= 4) */
+/* /\* TODO: handle invalid utf-8 *\/ */
+/* {} */
+/* i += n+1; */
+/* } */
+/* ret++; */
+/* } */
 
-   return ret;
-}
+/* return ret; */
+/* } */
 
 /* Given a BAT return the number of digrams in it. The observation is
  * that the number of digrams is the number of characters - 1:
@@ -55,7 +55,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
// lng t0;
BUN i;
BATiter bi;
-   uint8_t *s;
+   char *s;
// GDKtracer_set_component_level("ALGO", "DEBUG");
// struct canditer ci;
 
@@ -66,8 +66,9 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
bi = bat_iterator(b);
*n = 0;
for (i = 0; i < b->batCount; i++) {
-   s = (uint8_t *)BUNtail(bi, i);
-*n += GDKstrimp_strlen(s) - 1;
+   s = (char *)BUNtail(bi, i);
+// *n += GDKstrimp_strlen(s) - 1;
+   *n += strlen(s) - 1;
// TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, s);
}
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Get the correct argument from the MAL...

2021-03-04 Thread Panagiotis Koutsourakis
Changeset: 0d8e5444d101 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=0d8e5444d101
Modified Files:
monetdb5/modules/mal/batExtensions.c
Branch: string_imprints
Log Message:

Get the correct argument from the MAL stack


diffs (12 lines):

diff --git a/monetdb5/modules/mal/batExtensions.c 
b/monetdb5/modules/mal/batExtensions.c
--- a/monetdb5/modules/mal/batExtensions.c
+++ b/monetdb5/modules/mal/batExtensions.c
@@ -409,7 +409,7 @@ PATstrimp_makeheader(Client cntxt, MalBl
(void)cntxt;
(void)mb;
 
-   bid = *getArgReference_bat(stk, pci, 2);
+   bid = *getArgReference_bat(stk, pci, 1);
if ((b = BATdescriptor(bid)) == NULL)
throw(MAL, "bat.strimpHeader", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Small changes

2021-03-04 Thread Panagiotis Koutsourakis
Changeset: 1ef057324896 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=1ef057324896
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Small changes


diffs (21 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -84,7 +84,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
  */
 #define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x)))
 #define isNotIgnored(x) (!isIgnored(x))
-#define pairToIndex(b1, b2) (((uint8_t)b1)<<8 | ((uint8_t)b2))
+#define pairToIndex(b1, b2) (DataPair)(((uint8_t)b1)<<8 | ((uint8_t)b2))
 #define indexToPair1(idx) (idx & 0xff00) >> 8
 #define indexToPair2(idx) (idx & 0xff)
 #define swp(_a, _i, _j, TPE)   \
@@ -148,7 +148,7 @@ GDKstrimp_make_histogram(BAT *b, uint64_
}
}
 
-   TRC_DEBUG_ENDIF(ALGO, LLFMT "usec\n", GDKusec() - t0);
+   TRC_DEBUG(ALGO, LLFMT " usec\n", GDKusec() - t0);
GDKtracer_flush_buffer();
return GDK_SUCCEED;
 }
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Functions to construct the string imp...

2021-03-04 Thread Panagiotis Koutsourakis
Changeset: 6ab7ac7f1321 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6ab7ac7f1321
Modified Files:
gdk/gdk.h
gdk/gdk_private.h
gdk/gdk_strimps.c
gdk/gdk_strimps.h
Branch: string_imprints
Log Message:

Functions to construct the string imprint for a given BAT


diffs (153 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -701,6 +701,7 @@ typedef struct {
Hash *hash; /* hash table */
Imprints *imprints; /* column imprints index */
Heap *orderidx; /* order oid index */
+   Heap *strimps;  /* string imprint index  */
 
PROPrec *props; /* list of dynamic properties stored in the bat 
descriptor */
 } COLrec;
@@ -772,6 +773,7 @@ typedef struct BATiter {
 #define thash  T.hash
 #define timprints  T.imprints
 #define tprops T.props
+#define tstrimps   T.strimps
 
 
 
diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -25,7 +25,8 @@ enum heaptype {
varheap,
hashheap,
imprintsheap,
-   orderidxheap
+   orderidxheap,
+   strimpheap
 };
 
 #ifdef GDKLIBRARY_OLDDATE
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -267,3 +267,85 @@ GDKstrimp_make_bitstring(const str s, St
 
return ret;
 }
+
+/* Create the heap for a string imprint. Returns NULL on failure. */
+static Heap *
+createStrimpheap(BAT *b, StrimpHeader *h)
+{
+   Heap *r = NULL;
+   uint64_t *d;
+   size_t i,j;
+   const char *nme;
+
+   nme = GDKinmemory(b->theap.farmid) ? ":memory:" : 
BBP_physical(b->batCacheid);
+   if ((r = GDKzalloc(sizeof(Heap))) == NULL ||
+   (r->farmid = BBPselectfarm(b->batRole, b->ttype, strimpheap)) < 0 ||
+   strconcat_len(r->filename, sizeof(r->filename),
+ nme, ".strimp", NULL) >= sizeof(r->filename) ||
+   HEAPalloc(r, BATcount(b) + STRIMP_OFFSET, sizeof(uint64_t)) != 
GDK_SUCCEED) {
+   GDKfree(r);
+   return NULL;
+   }
+   r->free = STRIMP_OFFSET * sizeof(uint64_t);
+
+   d = (uint64_t *)r->base;
+   /* This loop assumes that we are working with byte pairs
+* (i.e. the type of the header is uint16_t). TODO: generalize.
+*/
+   for(i = 0; i < STRIMP_HEADER_SIZE; i += 4) {
+   *d = 0;
+   for(j = 0; j < 4; j++) {
+   *d <<= 16;
+   *d |= h->bytepairs[i + j];
+   }
+   }
+   return r;
+}
+
+/* Create */
+gdk_return
+GDKstrimp_create_strimp(BAT *b)
+{
+   lng t0 = 0;
+   BATiter bi;
+   BUN i;
+   str s;
+   StrimpHeader *head;
+   Heap *h;
+   uint64_t *dh;
+
+   assert(b->ttype == TYPE_str);
+   TRC_DEBUG_IF(ALGO) t0 = GDKusec();
+
+   if ((head = create_header(b)) == NULL) {
+   return GDK_FAIL;
+   }
+
+   if ((h = createStrimpheap(b, head)) == NULL) {
+   GDKfree(head);
+   return GDK_FAIL;
+   }
+   dh = (uint64_t *)h->base + h->free;
+
+   bi = bat_iterator(b);
+   for (i = 0; i < b->batCount; i++) {
+   s = (str)BUNtvar(bi, i);
+   if (!strNil(s))
+   *dh++ = GDKstrimp_make_bitstring(s, head);
+   else
+   *dh++ = 0; /* no pairs in nil values */
+
+   }
+
+   /* After we have computed the strimp, attempt to write it back
+* to the BAT.
+*/
+   MT_lock_set(&b->batIdxLock);
+   b->tstrimps = h;
+   b->batDirtydesc = true;
+   /* persistStrimp(b) */
+   MT_lock_unset(&b->batIdxLock);
+
+   TRC_DEBUG(ALGO, "strimp creation took " LLFMT " usec\n", GDKusec()-t0);
+   return GDK_SUCCEED;
+}
diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h
--- a/gdk/gdk_strimps.h
+++ b/gdk/gdk_strimps.h
@@ -11,19 +11,25 @@
 
 #include 
 
+
+#define STRIMP_VERSION (uint64_t)1
 /* Count the occurences of pairs of bytes. This is a compromise between
  * just handling ASCII and full UTF-8 support.
  */
 #define STRIMP_HISTSIZE 256*256
-#define STRIMP_SIZE 64
+#define STRIMP_HEADER_SIZE 64
+#define STRIMP_OFFSET 1 + STRIMP_HEADER_SIZE*sizeof(DataPair)/sizeof(uint64_t) 
/* version + header */
 
+
+typedef uint16_t DataPair;
 typedef struct {
// TODO: find a better name for this
-   uint16_t bytepairs[STRIMP_SIZE];
+   DataPair bytepairs[STRIMP_HEADER_SIZE];
 } StrimpHeader;
 
 gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n); // Remove?
 gdk_export gdk_return GDKstrimp_make_histogram(BAT *b, uint64_t *hist, size_t 
hist_size, size_t *nbins); // make static
 // gdk_export gdk_return GDKstrimp_make_header(StrimpHeader *h, uint64_t 
*hist, size_t hist_size); // make static
-gdk_export gdk_return GDKstrimp_make_header(BAT *b);
+//gdk_export gdk_return 

MonetDB: string_imprints - Handle ignored bytes correctly

2021-03-01 Thread Panagiotis Koutsourakis
Changeset: e752aa525361 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e752aa525361
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Handle ignored bytes correctly


diffs (35 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -93,7 +93,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
 gdk_return
 GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t 
*count)
 {
-   lng t0;
+   lng t0=0;
size_t hi;
BUN i;
BATiter bi;
@@ -111,7 +111,21 @@ GDKstrimp_makehistogram(BAT *b, uint64_t
s = (char *)BUNtvar(bi, i);
if (!strNil(s)) {
for(ptr = s; *ptr != 0 && *(ptr + 1) != 0; ptr++) {
-   if (isNotIgnored(*ptr) && 
isNotIgnored(*(ptr+1))) {
+   if (isIgnored(*(ptr+1))) {
+   /* Skip this and the next pair
+* if the next char is ignored.
+*/
+   ptr++;
+   }
+   else if (isIgnored(*ptr)) {
+   /* Skip this pair if the current
+* char is ignored. This should
+* only happen at the beginnig
+* of a string.
+*/
+   ;
+   }
+   else {
hi = pairToIndex(*(ptr), *(ptr+1));
assert(hi < hist_size);
if (hist[hi] == 0)
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - First implementation of strimp header...

2021-03-01 Thread Panagiotis Koutsourakis
Changeset: 57ba6f8b90aa for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=57ba6f8b90aa
Modified Files:
gdk/gdk_strimps.c
gdk/gdk_strimps.h
monetdb5/modules/mal/batExtensions.c
Branch: string_imprints
Log Message:

First implementation of strimp header construction


diffs (212 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -84,7 +84,56 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
 #define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x)))
 #define isNotIgnored(x) (!isIgnored(x))
 #define pairToIndex(b1, b2) (((uint8_t)b1)<<8 | ((uint8_t)b2))
+#define indexToPair1(idx) (idx & 0xff00) >> 8
+#define indexToPair2(idx) (idx & 0xff)
+#define swp(_a, _i, _j, TPE)   \
+   do {\
+   TPE _t = ((TPE *)_a)[_i];   \
+   ((TPE *) _a)[_i] = ((TPE *) _a)[_j];\
+   ((TPE *) _a)[_j] = _t;  \
+   } while(0)
 
+static StrimpHeader *
+make_header(StrimpHeader *h, uint64_t* hist, size_t hist_size)
+{
+   lng t0 = 0;
+   size_t i;
+   uint64_t max_counts[STRIMP_SIZE] = {0};
+   const size_t cmin_max = STRIMP_SIZE - 1;
+   size_t hidx;
+
+   TRC_DEBUG_IF(ALGO) t0 = GDKusec();
+
+   for(i = 0; i < STRIMP_SIZE; i++)
+   h->bytepairs[i] = 0;
+
+   for(i = 0; i < hist_size; i++) {
+   if (max_counts[cmin_max] < hist[i]) {
+   max_counts[cmin_max] = hist[i];
+   h->bytepairs[cmin_max] = i;
+for(hidx = cmin_max; hidx > 0 && max_counts[hidx] > 
max_counts[hidx-1]; hidx--) {
+   swp(max_counts, hidx, hidx-1, uint64_t);
+   swp(h->bytepairs, hidx, hidx-1, uint16_t);
+   }
+   }
+   }
+
+   for(i = 0; i < STRIMP_SIZE; i++) {
+   TRC_DEBUG(ALGO, "%u %u: %lu", indexToPair1(h->bytepairs[i]), 
indexToPair2(h->bytepairs[i]), max_counts[i]);
+   }
+
+   TRC_DEBUG_ENDIF(ALGO, LLFMT "usec\n", GDKusec() - t0);
+
+   return h;
+}
+
+
+/* static uint64_t */
+/* add_to_header(size_t idx, uint64_t count) */
+/* { */
+/* while */
+/* return GDK_SUCCEED; */
+/* } */
 /* Construct a histogram of pairs of bytes.
  *
  * Return the histogram in hist and the number of non-zero bins in
@@ -98,6 +147,7 @@ GDKstrimp_make_histogram(BAT *b, uint64_
BUN i;
BATiter bi;
char *ptr, *s;
+   /* uint64_t cur_min = 0; */
 
TRC_DEBUG_IF(ALGO) t0 = GDKusec();
assert(b->ttype == TYPE_str);
@@ -131,12 +181,61 @@ GDKstrimp_make_histogram(BAT *b, uint64_
if (hist[hi] == 0)
(*nbins)++;
hist[hi]++;
+   /* if (hist[hi] > cur_min) */
+   /*  cur_min = add_to_header(hi, 
hist[hi]); */
}
}
}
}
 
-   TRC_DEBUG(ALGO, LLFMT "usec\n", GDKusec() - t0);
+   TRC_DEBUG_ENDIF(ALGO, LLFMT "usec\n", GDKusec() - t0);
GDKtracer_flush_buffer();
return GDK_SUCCEED;
 }
+
+gdk_return
+GDKstrimp_make_header(BAT *b)
+{
+   uint64_t hist[STRIMP_HISTSIZE] = {0};
+   size_t nbins = 0;
+   StrimpHeader header;
+   if(GDKstrimp_make_histogram(b, hist, STRIMP_HISTSIZE, &nbins) != 
GDK_SUCCEED) {
+   return GDK_FAIL;
+   }
+
+   make_header(&header, hist, STRIMP_HISTSIZE);
+
+   return GDK_SUCCEED;
+}
+
+
+/* static uint8_t */
+/* lookup_index(StrimpHeader *h, uint16_t n) */
+/* { */
+/* size_t i; */
+/* for(i = 0; i < STRIMP_SIZE; i++) */
+/* if(h->bytepairs[i] == n) */
+/* return i; */
+
+/* return 0; */
+/* } */
+
+
+/* Given a strimp header and a string compute the bitstring of which
+ * digrams(byte pairs) are present in the string. The strimp header is a
+ * map from digram(byte pair) to index in the strimp.
+ */
+/* static uint64_t */
+/* GDKstrimp_make_bitstring(str s, StrimpHeader *h) */
+/* { */
+/* uint64_t ret = 0; */
+/* uint8_t pair_idx; */
+/* char *it; */
+
+/* for(it = s; *it != 0 && *(it+1) != 0; it++) { */
+/* pair_idx = lookup_index(h, pairToIndex(*it, *(it+1))); */
+/* ret |= 0x1 << pair_idx; */
+/* } */
+
+/* return ret; */
+/* } */
diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h
--- a/gdk/gdk_strimps.h
+++ b/gdk/gdk_strimps.h
@@ -15,8 +15,15 @@
  * just handling ASCII and full UTF-8 support.
  */
 #define STRIMP_HISTSIZE 256*256
-
-gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n);
+#define STRIMP_SIZE 64
 
-gdk_export gdk_return GDKstrimp_make_histogram(BAT *b, uint64_t *hist, size_t 
hist_size, size_t *nbins);
+typedef struct {
+   

MonetDB: string_imprints - Basic correct implementation

2021-03-01 Thread Panagiotis Koutsourakis
Changeset: 2e4b7358231f for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=2e4b7358231f
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Basic correct implementation

Do not miss anything, do not allow ignored characters.


diffs (64 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -81,16 +81,25 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
  * non-ASCII codepoints that are considered spaces, for example the
  * codepoints in the range U+2000-U+200f.
  */
-#define isIgnored(x) isspace((x)) || isdigit((x))
-#define pairToIndex(b1, b2) (b1)<<8 | (b2)
+#define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x)))
+#define isNotIgnored(x) (!isIgnored(x))
+#define pairToIndex(b1, b2) (((uint8_t)b1)<<8 | ((uint8_t)b2))
 
+/* Construct a histogram of pairs of bytes.
+ *
+ * Return the histogram in hist and the number of non-zero bins in
+ * count.
+ */
 gdk_return
 GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t 
*count)
 {
+   lng t0;
size_t hi;
BUN i;
BATiter bi;
-   uint8_t *ptr, *s;
+   char *ptr, *s;
+
+   TRC_DEBUG_IF(ALGO) t0 = GDKusec();
assert(b->ttype == TYPE_str);
 
for(hi = 0; hi < hist_size; hi++)
@@ -99,18 +108,21 @@ GDKstrimp_makehistogram(BAT *b, uint64_t
bi = bat_iterator(b);
*count = 0;
for(i = 0; i < b->batCount; i++) {
-   s = (uint8_t *)BUNtail(bi, i);
-   for(ptr = s; *(ptr + 1) != 0; ptr++) {
-   if (isIgnored(*ptr)) /* skip the current pair and the 
next at the end of the loop */
-   ptr++;
-   else {
-   hi = pairToIndex(*(ptr), *(ptr+1));
-   assert(hi < hist_size);
-   if (hist[hi] == 0)
-   (*count)++;
-   hist[hi]++;
+   s = (char *)BUNtvar(bi, i);
+   if (!strNil(s)) {
+   for(ptr = s; *ptr != 0 && *(ptr + 1) != 0; ptr++) {
+   if (isNotIgnored(*ptr) && 
isNotIgnored(*(ptr+1))) {
+   hi = pairToIndex(*(ptr), *(ptr+1));
+   assert(hi < hist_size);
+   if (hist[hi] == 0)
+   (*count)++;
+   hist[hi]++;
+   }
}
}
}
+
+   TRC_DEBUG(ALGO, LLFMT "usec\n", GDKusec() - t0);
+   GDKtracer_flush_buffer();
return GDK_SUCCEED;
 }
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Improve names

2021-03-01 Thread Panagiotis Koutsourakis
Changeset: 950d5acff27f for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=950d5acff27f
Modified Files:
gdk/gdk_strimps.c
gdk/gdk_strimps.h
monetdb5/modules/mal/batExtensions.c
Branch: string_imprints
Log Message:

Improve names


diffs (53 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -91,7 +91,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
  * count.
  */
 gdk_return
-GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t 
*count)
+GDKstrimp_make_histogram(BAT *b, uint64_t *hist, size_t hist_size, size_t 
*nbins)
 {
lng t0=0;
size_t hi;
@@ -106,7 +106,7 @@ GDKstrimp_makehistogram(BAT *b, uint64_t
hist[hi] = 0;
 
bi = bat_iterator(b);
-   *count = 0;
+   *nbins = 0;
for(i = 0; i < b->batCount; i++) {
s = (char *)BUNtvar(bi, i);
if (!strNil(s)) {
@@ -129,7 +129,7 @@ GDKstrimp_makehistogram(BAT *b, uint64_t
hi = pairToIndex(*(ptr), *(ptr+1));
assert(hi < hist_size);
if (hist[hi] == 0)
-   (*count)++;
+   (*nbins)++;
hist[hi]++;
}
}
diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h
--- a/gdk/gdk_strimps.h
+++ b/gdk/gdk_strimps.h
@@ -17,6 +17,6 @@
 #define STRIMP_HISTSIZE 256*256
 
 gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n);
-gdk_export gdk_return GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t 
hist_size, size_t *count);
 
+gdk_export gdk_return GDKstrimp_make_histogram(BAT *b, uint64_t *hist, size_t 
hist_size, size_t *nbins);
 #endif /* _GDK_STRIMPS_H_ */
diff --git a/monetdb5/modules/mal/batExtensions.c 
b/monetdb5/modules/mal/batExtensions.c
--- a/monetdb5/modules/mal/batExtensions.c
+++ b/monetdb5/modules/mal/batExtensions.c
@@ -380,7 +380,7 @@ CMDstrimp_makehist(Client cntxt, MalBlkP
if ((b = BATdescriptor(bid)) == NULL)
throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
 
-   if (!GDKstrimp_makehistogram(b, hist, STRIMP_HISTSIZE, )) {
+   if (!GDKstrimp_make_histogram(b, hist, STRIMP_HISTSIZE, )) {
throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) 
OPERATION_FAILED);
}
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Count the total number of digrams in ...

2021-02-23 Thread Panagiotis Koutsourakis
Changeset: fabfd34343c3 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=fabfd34343c3
Added Files:
gdk/gdk_strimps.c
gdk/gdk_strimps.h
Modified Files:
gdk/CMakeLists.txt
gdk/gdk.h
monetdb5/modules/mal/01_calc.mal
monetdb5/modules/mal/batcalc.c
Branch: string_imprints
Log Message:

Count the total number of digrams in a string bat


diffs (190 lines):

diff --git a/gdk/CMakeLists.txt b/gdk/CMakeLists.txt
--- a/gdk/CMakeLists.txt
+++ b/gdk/CMakeLists.txt
@@ -78,6 +78,7 @@ target_sources(bat
   gdk_analytic_func.c
   gdk_analytic.h
   gdk_tracer.c gdk_tracer.h
+  gdk_strimps.c gdk_strimps.h
   PUBLIC
   ${gdk_public_headers})
 
diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -2113,4 +2113,9 @@ gdk_export BAT *BATsample_with_seed(BAT 
  */
 #define MAXPARAMS  32
 
+/*
+ * String Imprints Development/Testing. TODO: remove the following.
+ */
+
+#include "gdk_strimps.h"
 #endif /* _GDK_H_ */
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
new file mode 100644
--- /dev/null
+++ b/gdk/gdk_strimps.c
@@ -0,0 +1,55 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0.  If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
+ */
+
+#include "monetdb_config.h"
+#include "gdk.h"
+#include "gdk_private.h"
+
+/* This counts how many unicode codepoints the given string
+ * contains.
+ */
+static size_t
+GDKstrimp_strlen(const char *s)
+{
+   return strlen(s);
+}
+
+/* Given a BAT return the number of digrams in it. The observation is
+ * that the number of digrams is the number of characters - 1:
+ *
+ * 1 digram starting at character 1
+ * 1 digram starting at character 2
+ * [...]
+ * 1 digram starting at character n - 1
+ */
+gdk_return
+GDKstrimp_ndigrams(BAT *b, size_t *n)
+{
+   // lng t0;
+   BUN i;
+   BATiter bi;
+   char *s;
+   // GDKtracer_set_component_level("ALGO", "DEBUG");
+   // struct canditer ci;
+
+   // t0 = GDKusec();
+   // BATcheck(b, NULL);
+   assert(b->ttype == TYPE_str);
+
+   bi = bat_iterator(b);
+   *n = 0;
+   for (i = 0; i < b->batCount; i++) {
+   s = (char *)BUNtail(bi, i);
+*n += GDKstrimp_strlen(s) - 1;
+   // TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, (char *)BUNtail(bi, 
i));
+   }
+
+   // TRC_DEBUG(ALGO, LLFMT "usec\n", GDKusec() - t0);
+
+   return GDK_SUCCEED;
+}
diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h
new file mode 100644
--- /dev/null
+++ b/gdk/gdk_strimps.h
@@ -0,0 +1,27 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0.  If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V.
+ */
+
+#ifndef _GDK_STRIMPS_H_
+#define _GDK_STRIMPS_H_
+
+#include 
+
+#define HISTSIZE 64
+
+typedef struct {
+   uint64_t counts[HISTSIZE];
+   char foo;
+} Histogram;
+
+typedef struct {
+   Histogram* hist;
+} Strimp;
+
+gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n);
+
+#endif /* _GDK_STRIMPS_H_ */
diff --git a/monetdb5/modules/mal/01_calc.mal b/monetdb5/modules/mal/01_calc.mal
--- a/monetdb5/modules/mal/01_calc.mal
+++ b/monetdb5/modules/mal/01_calc.mal
@@ -5593,3 +5593,9 @@ comment "Calculate aggregate string conc
 pattern 
str_group_concat(b:bat[:str],sep:bat[:str],s:bat[:oid],nil_if_empty:bit) :str
 address CMDBATstr_group_concat
 comment "Calculate aggregate string concatenate of B with candidate list and 
separator SEP.";
+
+
+# foo
+pattern str_iterate_bat(b:bat[:str]) :void;
+address CMDstr_iterate_bat
+comment "iterate through a bat";
diff --git a/monetdb5/modules/mal/batcalc.c b/monetdb5/modules/mal/batcalc.c
--- a/monetdb5/modules/mal/batcalc.c
+++ b/monetdb5/modules/mal/batcalc.c
@@ -1368,6 +1368,39 @@ CMDifthen(Client cntxt, MalBlkPtr mb, Ma
return MAL_SUCCEED;
 }
 
+
+/*
+ * String imprints dev/testing. TODO: remove.
+ */
+static str
+CMDstrimp_ndigrams(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
+   bat bid;
+   BAT *b;
+   size_t n;
+
+   (void)cntxt;
+   (void)mb;
+
+   // return mythrow(MAL, "batcalc.striter", OPERATION_FAILED);
+   bid = *getArgReference_bat(stk, pci, 1);
+   if ((b = BATdescriptor(bid)) == NULL)
+   throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
+
+   if (!GDKstrimp_ndigrams(b, &n)) {
+   throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) 
OPERATION_FAILED);
+   }
+
+   *getArgReference_lng(stk, pci, 0) = n;
+
+   return MAL_SUCCEED;
+}
+
+
+/*
+ * String imprints dev/testing. TODO: end remove.
+ */
+
 #include "mel.h"
 
 static str
@@ -2187,7 +2220,17 @@ static mel_func 

MonetDB: string_imprints - Merge branch 'master' into branches/s...

2021-02-23 Thread Panagiotis Koutsourakis
Changeset: c5599a533197 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c5599a533197
Branch: string_imprints
Log Message:

Merge branch 'master' into branches/string_imprints


diffs (truncated from 13685 to 300 lines):

diff --git a/ctest/tools/monetdbe/CMakeLists.txt 
b/ctest/tools/monetdbe/CMakeLists.txt
--- a/ctest/tools/monetdbe/CMakeLists.txt
+++ b/ctest/tools/monetdbe/CMakeLists.txt
@@ -20,12 +20,6 @@ target_link_libraries(example2
 monetdbe)
 add_test(run_example2 example2)
 
-add_executable(example_proxy example_proxy.c)
-target_link_libraries(example_proxy
-  PRIVATE
-monetdb_config_header
-monetdbe)
-
 add_executable(example_temporal example_temporal.c)
 target_link_libraries(example_temporal
   PRIVATE
@@ -77,12 +71,12 @@ target_link_libraries(example_connection
 monetdbe)
 add_test(run_example_connections example_connections)
 
-add_executable(example_remote example_remote.c)
-target_link_libraries(example_remote
+add_executable(example_proxy example_proxy.c)
+target_link_libraries(example_proxy
   PRIVATE
 monetdb_config_header
 monetdbe)
-add_test(run_example_remote example_remote)
+add_test(run_example_proxy example_proxy)
 
 if(WITH_CMOCKA)
   add_executable(cmocka_test cmocka_test.c test_helper.c)
@@ -95,3 +89,9 @@ if(WITH_CMOCKA)
   )
   add_test(run_cmocka_test cmocka_test)
 endif()
+
+if (TESTING)
+  install(TARGETS
+example_proxy
+DESTINATION ${CMAKE_INSTALL_BINDIR})
+endif()
diff --git a/ctest/tools/monetdbe/Tests/All b/ctest/tools/monetdbe/Tests/All
new file mode 100644
--- /dev/null
+++ b/ctest/tools/monetdbe/Tests/All
@@ -0,0 +1,1 @@
+example_proxy
diff --git a/ctest/tools/monetdbe/Tests/example_proxy.SQL.py 
b/ctest/tools/monetdbe/Tests/example_proxy.SQL.py
new file mode 100644
--- /dev/null
+++ b/ctest/tools/monetdbe/Tests/example_proxy.SQL.py
@@ -0,0 +1,45 @@
+import os, pymonetdb
+import subprocess
+
+db = os.getenv("TSTDB")
+port = os.getenv("MAPIPORT")
+
+client1 = pymonetdb.connect(database=db, port=port, autocommit=True)
+cur1 = client1.cursor()
+cur1.execute('''
+CREATE TABLE test (x INTEGER, y STRING);
+INSERT INTO test VALUES (42, 'Hello'), (NULL, 'World');
+''')
+
+cur1.close()
+client1.close()
+
+cmd = ['example_proxy', port, db]
+results = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, 
encoding='utf-8')
+
+if results.stderr:
+print(results.stderr)
+
+lines = results.stdout.splitlines()
+
+if len(lines) != 3:
+print(results.stdout)
+print("Too many output lines.")
+exit(1)
+
+def test_equal(expected, received):
+if received != expected:
+print("expected:")
+print(expected)
+print("received:")
+print(received)
+exit(1)
+
+expected="Query result with 2 cols and 2 rows"
+test_equal(expected, lines[0])
+
+expected="42, Hello"
+test_equal(expected, lines[1])
+
+expected="NULL, World"
+test_equal(expected, lines[2])
diff --git a/ctest/tools/monetdbe/example_proxy.c 
b/ctest/tools/monetdbe/example_proxy.c
--- a/ctest/tools/monetdbe/example_proxy.c
+++ b/ctest/tools/monetdbe/example_proxy.c
@@ -11,63 +11,34 @@
 #include 
 #include 
 
+#define expected_error(msg) {fprintf(stderr, "Failure: %s\n", msg); return 0;}
 #define error(msg) {fprintf(stderr, "Failure: %s\n", msg); return -1;}
 
 int
-main(void)
+main(int argc, char** argv)
 {
+   (void) argc;
char* err = NULL;
monetdbe_database mdbe = NULL;
monetdbe_result* result = NULL;
-
-monetdbe_remote remote = {
-.host = "127.0.0.1",
-.port = 50001,
-.username="monetdb",
-.password="monetdb",
-.lang="sql"};
-
-monetdbe_options opt = {.remote = &remote};
+   assert(argc==3);
+   const int port = strtol(argv[1], NULL, 10);
+   const char* database = argv[2];
+   monetdbe_remote remote = {.host="localhost", .port=port, 
.database=database, .username="monetdb", .password="monetdb"};
+   monetdbe_options opts = {.remote = &remote};
 
-   // second argument is a string for the db directory or NULL for 
in-memory mode
-   if (monetdbe_open(&mdbe, 
"mapi:monetdb://127.0.0.1:5?database=devdb", &opt))
-   error("Failed to open database")
-
-
-   if ((err = monetdbe_query(mdbe, "DELETE FROM test WHERE x < 0; ", 
&result, NULL)) != NULL)
+   if (monetdbe_open(&mdbe, NULL, &opts))
+   expected_error("Failed to open database")
+   if ((err = monetdbe_query(mdbe, "SELECT x, y FROM test ORDER BY y ASC; 
", &result, NULL)) != NULL)
error(err)
 
-   if ((err = monetdbe_query(mdbe, "SELECT * FROM test; ", &result, NULL)) 
!= NULL)
-   error(err)
-
-   monetdbe_column* appendable_columns[2];
-
fprintf(stdout, "Query result with %zu cols and %"PRId64" rows\n", 
result->ncols, result->nrows);
for (int64_t r = 0; r < result->nrows; r++) {
for (size_t c = 0; c < result->ncols; c++) {
monetdbe_column* rcol;
if ((err 

MonetDB: string_imprints - Move MAL code to proper module

2021-02-23 Thread Panagiotis Koutsourakis
Changeset: f6360c814cda for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f6360c814cda
Modified Files:
monetdb5/modules/mal/batExtensions.c
monetdb5/modules/mal/batcalc.c
Branch: string_imprints
Log Message:

Move MAL code to proper module


diffs (155 lines):

diff --git a/monetdb5/modules/mal/batExtensions.c 
b/monetdb5/modules/mal/batExtensions.c
--- a/monetdb5/modules/mal/batExtensions.c
+++ b/monetdb5/modules/mal/batExtensions.c
@@ -337,6 +337,75 @@ CMDBATappend_bulk(Client cntxt, MalBlkPt
return MAL_SUCCEED;
 }
 
+/*
+ * String imprints dev/testing. TODO: remove.
+ */
+static str
+CMDstrimp_ndigrams(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
+   bat bid;
+   BAT *b;
+   size_t n;
+
+   (void)cntxt;
+   (void)mb;
+
+   // return mythrow(MAL, "batcalc.striter", OPERATION_FAILED);
+   bid = *getArgReference_bat(stk, pci, 1);
+   if ((b = BATdescriptor(bid)) == NULL)
+   throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
+
+   if (!GDKstrimp_ndigrams(b, &n)) {
+   throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) 
OPERATION_FAILED);
+   }
+
+   *getArgReference_lng(stk, pci, 0) = n;
+
+   return MAL_SUCCEED;
+}
+
+static str
+CMDstrimp_makehist(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
+   bat bid;
+   BAT *b, *ob;
+   size_t i;
+   uint64_t hist[STRIMP_HISTSIZE];
+   uint16_t count;
+
+   (void)cntxt;
+   (void)mb;
+
+   bid = *getArgReference_bat(stk, pci, 2);
+   if ((b = BATdescriptor(bid)) == NULL)
+   throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
+
+   if (!GDKstrimp_makehistogram(b, hist, STRIMP_HISTSIZE, &count)) {
+   throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) 
OPERATION_FAILED);
+   }
+
+   ob = COLnew(0, TYPE_lng, STRIMP_HISTSIZE, TRANSIENT);
+   if (ob == NULL) {
+   throw(MAL, "strimp.makehist", SQLSTATE(HY013) MAL_MALLOC_FAIL);
+   }
+
+   for (i=0; i < STRIMP_HISTSIZE; i++) {
+   if (BUNappend(ob, hist + i, false) != GDK_SUCCEED)
+   throw(MAL, "strimp.makehist", SQLSTATE(HY013) 
MAL_MALLOC_FAIL);
+   }
+
+   *getArgReference_bat(stk, pci, 0) = count;
+   // *getArgReference_bat(stk, pci, 1) = ob->batCacheid;
+
+   // BBPkeepref(ob->batCacheid);
+   return MAL_SUCCEED;
+}
+
+/*
+ * String imprints dev/testing. TODO: end remove.
+ */
+
+
 #include "mel.h"
 mel_func batExtensions_init_funcs[] = {
  pattern("bat", "new", CMDBATnew, false, "", args(1,2, 
batargany("",1),argany("tt",1))),
@@ -365,6 +434,17 @@ mel_func batExtensions_init_funcs[] = {
 #endif
  pattern("bat", "appendBulk", CMDBATappend_bulk, false, "append the arguments 
ins to i", args(1,4, batargany("",1), 
batargany("i",1),arg("force",bit),varargany("ins",1))),
  pattern("bat", "appendBulk", CMDBATappend_bulk, false, "append the arguments 
ins to i", args(1,4, batargany("",1), 
batargany("i",1),arg("force",bit),batvarargany("ins",1))),
+
+  /*
+  * String imprints dev/testing. TODO: remove.
+  */
+ pattern("bat", "count_digrams", CMDstrimp_ndigrams, false, "count digrams in 
a string bat", args(1, 2, arg("",lng), batarg("b", str))),
+ //pattern("batcalc", "make_histogam", CMDstrimp_makehist, false, "make a 
histogram of all the byte pairs in a BAT", args(2, 3, arg("", sht), batarg("", 
lng), batarg("b", str))),
+ pattern("bat", "make_histogam", CMDstrimp_makehist, false, "make a histogram 
of all the byte pairs in a BAT", args(1, 2, arg("", sht), batarg("b", str))),
+ /*
+  * String imprints dev/testing. TODO: end remove.
+  */
+
  { .imp=NULL }
 };
 #include "mal_import.h"
diff --git a/monetdb5/modules/mal/batcalc.c b/monetdb5/modules/mal/batcalc.c
--- a/monetdb5/modules/mal/batcalc.c
+++ b/monetdb5/modules/mal/batcalc.c
@@ -1368,39 +1368,6 @@ CMDifthen(Client cntxt, MalBlkPtr mb, Ma
return MAL_SUCCEED;
 }
 
-
-/*
- * String imprints dev/testing. TODO: remove.
- */
-static str
-CMDstrimp_ndigrams(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
-{
-   bat bid;
-   BAT *b;
-   size_t n;
-
-   (void)cntxt;
-   (void)mb;
-
-   // return mythrow(MAL, "batcalc.striter", OPERATION_FAILED);
-   bid = *getArgReference_bat(stk, pci, 1);
-   if ((b = BATdescriptor(bid)) == NULL)
-   throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
-
-   if (!GDKstrimp_ndigrams(b, &n)) {
-   throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) 
OPERATION_FAILED);
-   }
-
-   *getArgReference_lng(stk, pci, 0) = n;
-
-   return MAL_SUCCEED;
-}
-
-
-/*
- * String imprints dev/testing. TODO: end remove.
- */
-
 #include "mel.h"
 
 static str
@@ -2221,14 +2188,6 @@ static mel_func batcalc_init_funcs[] = {
  pattern("batcalc", "ifthenelse", CMDifthen, false, "If-then-else operation to 
assemble a conditional result", 

MonetDB: string_imprints - Return the histogram itself along wit...

2021-02-23 Thread Panagiotis Koutsourakis
Changeset: 31582eece4b6 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=31582eece4b6
Modified Files:
gdk/gdk_strimps.c
gdk/gdk_strimps.h
monetdb5/modules/mal/batExtensions.c
Branch: string_imprints
Log Message:

Return the histogram itself along with the non-zero bin count


diffs (58 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -85,7 +85,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
 #define pairToIndex(b1, b2) (b1)<<8 | (b2)
 
 gdk_return
-GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, uint16_t 
*count)
+GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t 
*count)
 {
size_t hi;
BUN i;
diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h
--- a/gdk/gdk_strimps.h
+++ b/gdk/gdk_strimps.h
@@ -17,6 +17,6 @@
 #define STRIMP_HISTSIZE 256*256
 
 gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n);
-gdk_export gdk_return GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t 
hist_size, uint16_t *count);
+gdk_export gdk_return GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t 
hist_size, size_t *count);
 
 #endif /* _GDK_STRIMPS_H_ */
diff --git a/monetdb5/modules/mal/batExtensions.c 
b/monetdb5/modules/mal/batExtensions.c
--- a/monetdb5/modules/mal/batExtensions.c
+++ b/monetdb5/modules/mal/batExtensions.c
@@ -371,7 +371,7 @@ CMDstrimp_makehist(Client cntxt, MalBlkP
BAT *b, *ob;
size_t i;
uint64_t hist[STRIMP_HISTSIZE];
-   uint16_t count;
+   size_t count;
 
(void)cntxt;
(void)mb;
@@ -394,10 +394,10 @@ CMDstrimp_makehist(Client cntxt, MalBlkP
throw(MAL, "bat.strimpHistogram", SQLSTATE(HY013) 
MAL_MALLOC_FAIL);
}
 
-   *getArgReference_bat(stk, pci, 0) = count;
-   // *getArgReference_bat(stk, pci, 1) = ob->batCacheid;
+   *getArgReference_lng(stk, pci, 0) = count;
+   *getArgReference_bat(stk, pci, 1) = ob->batCacheid;
 
-   // BBPkeepref(ob->batCacheid);
+   BBPkeepref(ob->batCacheid);
return MAL_SUCCEED;
 }
 
@@ -433,7 +433,7 @@ mel_func batExtensions_init_funcs[] = {
 
  /* String imprints */
  pattern("bat", "strimpNDigrams", CMDstrimp_ndigrams, false, "count digrams in 
a string bat", args(1,2,arg("",lng),batarg("b",str))),
- pattern("bat", "strimpHistogam", CMDstrimp_makehist, false, "make a histogram 
of all the byte pairs in a BAT", args(1,2,arg("",lng), batarg("b",str))),
+ pattern("bat", "strimpHistogram", CMDstrimp_makehist, false, "make a 
histogram of all the byte pairs in a BAT", args(2,3,arg("",lng), 
batarg("",lng),batarg("b",str))),
  //pattern("batcalc", "make_histogam", CMDstrimp_makehist, false, "make a 
histogram of all the byte pairs in a BAT", args(2, 3, arg("", sht), batarg("", 
lng), batarg("b", str))),
  { .imp=NULL }
 };
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Count utf-8 chars correctly

2021-02-23 Thread Panagiotis Koutsourakis
Changeset: f0e19e88af26 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f0e19e88af26
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Count utf-8 chars correctly


diffs (62 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -14,9 +14,31 @@
  * contains.
  */
 static size_t
-GDKstrimp_strlen(const char *s)
+GDKstrimp_strlen(const uint8_t *s)
 {
-   return strlen(s);
+   size_t ret = 0;
+   size_t i;
+   int m,n;
+   uint8_t c;
+
+   i = 0;
+   while((c = *(s + i)) != 0) {
+   if (c < 0x80)
+   i++;
+   else {
+   for (n = 0, m=0x40; c & m; n++, m >>= 1)
+   ;
+   /* n is now the number of 10xx bytes that should
+  follow. */
+   if (n == 0 || n >= 4)
+   /* TODO: handle invalid utf-8 */
+   {}
+   i += n+1;
+   }
+   ret++;
+   }
+
+   return ret;
 }
 
 /* Given a BAT return the number of digrams in it. The observation is
@@ -33,7 +55,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
// lng t0;
BUN i;
BATiter bi;
-   char *s;
+   uint8_t *s;
// GDKtracer_set_component_level("ALGO", "DEBUG");
// struct canditer ci;
 
@@ -44,12 +66,13 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
bi = bat_iterator(b);
*n = 0;
for (i = 0; i < b->batCount; i++) {
-   s = (char *)BUNtail(bi, i);
+   s = (uint8_t *)BUNtail(bi, i);
 *n += GDKstrimp_strlen(s) - 1;
-   // TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, (char *)BUNtail(bi, 
i));
+   // TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, s);
}
 
// TRC_DEBUG(ALGO, LLFMT "usec\n", GDKusec() - t0);
+   // GDKtracer_flush_buffer();
 
return GDK_SUCCEED;
 }
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Rename functions for consistency

2021-02-23 Thread Panagiotis Koutsourakis
Changeset: f686c3ba196f for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f686c3ba196f
Modified Files:
monetdb5/modules/mal/batExtensions.c
Branch: string_imprints
Log Message:

Rename functions for consistency


diffs (81 lines):

diff --git a/monetdb5/modules/mal/batExtensions.c 
b/monetdb5/modules/mal/batExtensions.c
--- a/monetdb5/modules/mal/batExtensions.c
+++ b/monetdb5/modules/mal/batExtensions.c
@@ -338,7 +338,7 @@ CMDBATappend_bulk(Client cntxt, MalBlkPt
 }
 
 /*
- * String imprints dev/testing. TODO: remove.
+ * String imprints.
  */
 static str
 CMDstrimp_ndigrams(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
@@ -353,10 +353,10 @@ CMDstrimp_ndigrams(Client cntxt, MalBlkP
// return mythrow(MAL, "batcalc.striter", OPERATION_FAILED);
bid = *getArgReference_bat(stk, pci, 1);
if ((b = BATdescriptor(bid)) == NULL)
-   throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
+   throw(MAL, "bat.strimpDigrams", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
 
if (!GDKstrimp_ndigrams(b, &n)) {
-   throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) 
OPERATION_FAILED);
+   throw(MAL, "bat.strimpDigrams", SQLSTATE(HY002) 
OPERATION_FAILED);
}
 
*getArgReference_lng(stk, pci, 0) = n;
@@ -378,20 +378,20 @@ CMDstrimp_makehist(Client cntxt, MalBlkP
 
bid = *getArgReference_bat(stk, pci, 2);
if ((b = BATdescriptor(bid)) == NULL)
-   throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
+   throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
 
if (!GDKstrimp_makehistogram(b, hist, STRIMP_HISTSIZE, &count)) {
-   throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) 
OPERATION_FAILED);
+   throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) 
OPERATION_FAILED);
}
 
ob = COLnew(0, TYPE_lng, STRIMP_HISTSIZE, TRANSIENT);
if (ob == NULL) {
-   throw(MAL, "strimp.makehist", SQLSTATE(HY013) MAL_MALLOC_FAIL);
+   throw(MAL, "bat.strimpHistogram", SQLSTATE(HY013) 
MAL_MALLOC_FAIL);
}
 
for (i=0; i < STRIMP_HISTSIZE; i++) {
if (BUNappend(ob, hist + i, false) != GDK_SUCCEED)
-   throw(MAL, "strimp.makehist", SQLSTATE(HY013) 
MAL_MALLOC_FAIL);
+   throw(MAL, "bat.strimpHistogram", SQLSTATE(HY013) 
MAL_MALLOC_FAIL);
}
 
*getArgReference_bat(stk, pci, 0) = count;
@@ -401,10 +401,6 @@ CMDstrimp_makehist(Client cntxt, MalBlkP
return MAL_SUCCEED;
 }
 
-/*
- * String imprints dev/testing. TODO: end remove.
- */
-
 
 #include "mel.h"
 mel_func batExtensions_init_funcs[] = {
@@ -435,16 +431,10 @@ mel_func batExtensions_init_funcs[] = {
  pattern("bat", "appendBulk", CMDBATappend_bulk, false, "append the arguments 
ins to i", args(1,4, batargany("",1), 
batargany("i",1),arg("force",bit),varargany("ins",1))),
  pattern("bat", "appendBulk", CMDBATappend_bulk, false, "append the arguments 
ins to i", args(1,4, batargany("",1), 
batargany("i",1),arg("force",bit),batvarargany("ins",1))),
 
-  /*
-  * String imprints dev/testing. TODO: remove.
-  */
- pattern("bat", "count_digrams", CMDstrimp_ndigrams, false, "count digrams in 
a string bat", args(1, 2, arg("",lng), batarg("b", str))),
+ /* String imprints */
+ pattern("bat", "strimpNDigrams", CMDstrimp_ndigrams, false, "count digrams in 
a string bat", args(1,2,arg("",lng),batarg("b",str))),
+ pattern("bat", "strimpHistogam", CMDstrimp_makehist, false, "make a histogram 
of all the byte pairs in a BAT", args(1,2,arg("",lng), batarg("b",str))),
  //pattern("batcalc", "make_histogam", CMDstrimp_makehist, false, "make a 
histogram of all the byte pairs in a BAT", args(2, 3, arg("", sht), batarg("", 
lng), batarg("b", str))),
- pattern("bat", "make_histogam", CMDstrimp_makehist, false, "make a histogram 
of all the byte pairs in a BAT", args(1, 2, arg("", sht), batarg("b", str))),
- /*
-  * String imprints dev/testing. TODO: end remove.
-  */
-
  { .imp=NULL }
 };
 #include "mal_import.h"
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Byte pair histogram construction

2021-02-23 Thread Panagiotis Koutsourakis
Changeset: 4f3cbb1ef6c7 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4f3cbb1ef6c7
Modified Files:
gdk/gdk_strimps.c
gdk/gdk_strimps.h
Branch: string_imprints
Log Message:

Byte pair histogram construction

Count the occurrences of pairs of bytes. This is different than
counting pairs of characters, unless the characters are ASCII.


diffs (71 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -76,3 +76,41 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
 
return GDK_SUCCEED;
 }
+
+/* The isIgnored is a bit suspect in terms of unicode. There are
+ * non-ASCII codepoints that are considered spaces, for example the
+ * codepoints in the range U+2000-U+200f.
+ */
+#define isIgnored(x) isspace((x)) || isdigit((x))
+#define pairToIndex(b1, b2) (b1)<<8 | (b2)
+
+gdk_return
+GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, uint16_t 
*count)
+{
+   size_t hi;
+   BUN i;
+   BATiter bi;
+   uint8_t *ptr, *s;
+   assert(b->ttype == TYPE_str);
+
+   for(hi = 0; hi < hist_size; hi++)
+   hist[hi] = 0;
+
+   bi = bat_iterator(b);
+   *count = 0;
+   for(i = 0; i < b->batCount; i++) {
+   s = (uint8_t *)BUNtail(bi, i);
+   for(ptr = s; *(ptr + 1) != 0; ptr++) {
+   if (isIgnored(*ptr)) /* skip the current pair and the 
next at the end of the loop */
+   ptr++;
+   else {
+   hi = pairToIndex(*(ptr), *(ptr+1));
+   assert(hi < hist_size);
+   if (hist[hi] == 0)
+   (*count)++;
+   hist[hi]++;
+   }
+   }
+   }
+   return GDK_SUCCEED;
+}
diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h
--- a/gdk/gdk_strimps.h
+++ b/gdk/gdk_strimps.h
@@ -11,17 +11,12 @@
 
 #include 
 
-#define HISTSIZE 64
-
-typedef struct {
-   uint64_t counts[HISTSIZE];
-   char foo;
-} Histogram;
-
-typedef struct {
-   Histogram* hist;
-} Strimp;
+/* Count the occurences of pairs of bytes. This is a compromise between
+ * just handling ASCII and full UTF-8 support.
+ */
+#define STRIMP_HISTSIZE 256*256
 
 gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n);
+gdk_export gdk_return GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t 
hist_size, uint16_t *count);
 
 #endif /* _GDK_STRIMPS_H_ */
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Merge with default

2021-04-07 Thread Panagiotis Koutsourakis
Changeset: 0ce20141e77a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/0ce20141e77a
Modified Files:
gdk/CMakeLists.txt
gdk/gdk.h
gdk/gdk_private.h
gdk/gdk_strimps.c
monetdb5/modules/mal/01_calc.mal
monetdb5/modules/mal/batExtensions.c
monetdb5/modules/mal/batcalc.c
Branch: string_imprints
Log Message:

Merge with default


diffs (truncated from 418839 to 300 lines):

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md 
b/.github/ISSUE_TEMPLATE/bug_report.md
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -11,9 +11,9 @@ assignees: ''
 A clear and concise description of what the bug is.
 
 **To Reproduce**
-Create a setting with minimal input for an external user to demonstrate him 
the buggy behavior.
-This includes the relevant part of the database  schema description.
-Performance trace of the roque query (using the TRACE command)
+Create a setting with minimal input for an external user to demonstrate the 
buggy behavior.
+This includes the relevant part of the database schema description.
+Performance trace of the rogue query (using the TRACE command)
 
 **Expected behavior**
 A clear and concise description of what you expected to happen.
diff --git a/.hgtags b/.hgtags
--- a/.hgtags
+++ b/.hgtags
@@ -798,3 +798,5 @@ 929f5e280bc1532a2bfaab127ca7915dc3b69a33
 742b7847cfdcea39a6c19ab29eb35471d46bb2bb Oct2020_SP2_release
 17d27ad30941c81e4bc700300912e84e9b9a8c37 Oct2020_13
 17d27ad30941c81e4bc700300912e84e9b9a8c37 Oct2020_SP3_release
+6b71a8cc3498561815ac88d6c652922359efd13a Oct2020_15
+6b71a8cc3498561815ac88d6c652922359efd13a Oct2020_SP4_release
diff --git a/MonetDB.spec b/MonetDB.spec
--- a/MonetDB.spec
+++ b/MonetDB.spec
@@ -84,7 +84,7 @@ Group: Applications/Databases
 License: MPLv2.0
 URL: https://www.monetdb.org/
 BugURL: https://bugs.monetdb.org/
-Source: 
https://www.monetdb.org/downloads/sources/Oct2020-SP3/%{name}-%{version}.tar.bz2
+Source: 
https://www.monetdb.org/downloads/sources/Oct2020-SP4/%{name}-%{version}.tar.bz2
 
 # The Fedora packaging document says we need systemd-rpm-macros for
 # the _unitdir and _tmpfilesdir macros to exist; however on RHEL 7
@@ -302,8 +302,8 @@ This package contains the files needed t
 Summary: MonetDB ODBC driver
 Group: Applications/Databases
 Requires: %{name}-client%{?_isa} = %{version}-%{release}
-Requires(post): unixODBC
-Requires(postun): unixODBC
+Requires(post): %{_bindir}/odbcinst
+Requires(postun): %{_bindir}/odbcinst
 
 %description client-odbc
 MonetDB is a database management system that is developed from a
@@ -828,6 +828,48 @@ else
 fi
 
 %changelog
+* Fri Apr 02 2021 Sjoerd Mullender  - 11.39.15-20210402
+- Rebuilt.
+- GH#6786: function json.isvalid(js json) is not useful, could be removed
+- GH#7016: Database crashes when use similarity function on a table with
+  more than 200k records
+- GH#7037: Clearer err msg for ALTER USER with insufficient privileges
+- GH#7042: AddressSanitizer:DEADLYSIGNAL in Oct2020/gdk/gdk_tracer.c:494
+- GH#7050: file descriptor leak when forward=redirect
+- GH#7057: ODBC driver installer on Windows is missing some DLLs
+- GH#7058: MonetDBe: COPY INTO csv file does not produce any output
+- GH#7059: MonetDBe: 'reverse' C UDF crashes
+- GH#7061: Have bulk load support combined gzip files
+- GH#7064: Temporary hashes created in hash and unique logic should try to
+  use transient data farm first
+- GH#7066: percent_rank function with wrong results
+- GH#7070: double free error when running MonetDBe Example
+- GH#7076: mserver5 ignores memory.low from cgroups v2
+- GH#7077: Oct2020: new default privileges not effectively communicated
+- GH#7083: MonetDBe C++ Compiling Error
+- GH#7085: Mitosis and filter functions
+- GH#7087: SIGSEGV caused by error in subquery's function being ignored by
+  top-level query
+- GH#7089: Data consistency problem of query results in the latest release
+  of Monetdb (Remote Table)
+
+* Wed Mar 31 2021 Sjoerd Mullender  - 11.39.15-20210402
+- odbc: When connecting using a DSN (Data Source Name), information about the
+  data source is retrieved from the ODBC.INI file.  Now we also get the
+  location of the LOGFILE from this file.  The logfile can be used to
+  log all calls to the MonetDB ODBC driver to a file which can be used
+  for debugging.
+
+* Thu Mar 25 2021 Sjoerd Mullender  - 11.39.15-20210402
+- odbc: The ODBC driver now only passes on information about HUGEINT columns
+  as HUGEINT when the application has indicated interest by querying
+  about the SQL_HUGEINT extension type using the SQLGetTypeInfo
+  function or by specifying the type in a call to SQLSetDescField.
+  Otherwise the driver silently translates the HUGEINT type to BIGINT.
+  This means that most application will see BIGINT columns when the
+  server produced a HUGEINT column and only give an error if the value
+  in the HUGEINT column didn't fit into a BIGINT.
+
 * Thu Feb 11 2021 Sjoerd Mullender  - 

MonetDB: string_imprints - Fix header construction

2021-04-07 Thread Panagiotis Koutsourakis
Changeset: e332f5015f9c for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/e332f5015f9c
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Fix header construction


diffs (38 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -136,9 +136,9 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
  */
 #define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x)))
 #define isNotIgnored(x) (!isIgnored(x))
-#define pairToIndex(b1, b2) (DataPair)(((uint8_t)b1)<<8 | ((uint8_t)b2))
-#define indexToPair1(idx) (idx & 0xff00) >> 8
-#define indexToPair2(idx) (idx & 0xff)
+#define pairToIndex(b1, b2) (DataPair)(((uint8_t)b2)<<8 | ((uint8_t)b1))
+#define indexToPair2(idx) (idx & 0xff00) >> 8
+#define indexToPair1(idx) (idx & 0xff)
 #define swp(_a, _i, _j, TPE)   \
do {\
TPE _t = ((TPE *)_a)[_i];   \
@@ -330,9 +330,9 @@ create_strimp_heap(BAT *b, StrimpHeader 
Heap *r = NULL;
uint64_t *d;
uint64_t descriptor;
-   uint8_t npairs, bytes_per_pair;
-   uint16_t hsize;
-   size_t i,j;
+   uint64_t npairs, bytes_per_pair, hsize;
+   size_t i;
+   int j;
const char *nme;
 
nme = GDKinmemory(b->theap->farmid) ? ":memory:" : 
BBP_physical(b->batCacheid);
@@ -362,7 +362,7 @@ create_strimp_heap(BAT *b, StrimpHeader 
 */
for(i = 0; i < STRIMP_HEADER_SIZE; i += 4) {
*d = 0;
-   for(j = 0; j < 4; j++) {
+   for(j = 3; j >= 0; j--) {
*d <<= 16;
*d |= h->bytepairs[i + j];
}
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: default - Add lock github bot configuration

2021-04-19 Thread Panagiotis Koutsourakis
Changeset: 6fc05424ec63 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/6fc05424ec63
Added Files:
.github/lock.yml
Branch: default
Log Message:

Add lock github bot configuration


diffs (43 lines):

diff --git a/.github/lock.yml b/.github/lock.yml
new file mode 100644
--- /dev/null
+++ b/.github/lock.yml
@@ -0,0 +1,38 @@
+# Configuration for Lock Threads - https://github.com/dessant/lock-threads-app
+
+# Number of days of inactivity before a closed issue or pull request is locked
+daysUntilLock: 20
+
+# Skip issues and pull requests created before a given timestamp. Timestamp 
must
+# follow ISO 8601 (`YYYY-MM-DD`). Set to `false` to disable
+skipCreatedBefore: false
+
+# Issues and pull requests with these labels will be ignored. Set to `[]` to 
disable
+exemptLabels: []
+
+# Label to add before locking, such as `outdated`. Set to `false` to disable
+lockLabel: false
+
+# Comment to post before locking. Set to `false` to disable
+lockComment: >
+  This thread has been automatically locked since there has not been
+  any recent activity after it was closed. Please open a new issue for
+  related bugs.
+
+# Assign `resolved` as the reason for locking. Set to `false` to disable
+setLockReason: false
+
+# Limit to only `issues` or `pulls`
+# only: issues
+
+# Optionally, specify configuration settings just for `issues` or `pulls`
+# issues:
+#   exemptLabels:
+# - help-wanted
+#   lockLabel: outdated
+
+# pulls:
+#   daysUntilLock: 30
+
+# Repository to extend settings from
+# _extends: repo
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Do not try to generate strimp if it's...

2021-04-09 Thread Panagiotis Koutsourakis
Changeset: 76f731444d7c for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/76f731444d7c
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Do not try to generate strimp if it's already constructed


diffs (65 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -392,35 +392,37 @@ GDKstrimp_create_strimp(BAT *b)
assert(b->ttype == TYPE_str);
TRC_DEBUG_IF(ALGO) t0 = GDKusec();
 
-   if ((head = create_header(b)) == NULL) {
-   return GDK_FAIL;
-   }
+   if (b->tstrimps == NULL) {
+   if ((head = create_header(b)) == NULL) {
+   return GDK_FAIL;
+   }
 
-   if ((h = create_strimp_heap(b, head)) == NULL) {
-   GDKfree(head);
-   return GDK_FAIL;
-   }
-   dh = (uint64_t *)h->base + h->free;
+   if ((h = create_strimp_heap(b, head)) == NULL) {
+   GDKfree(head);
+   return GDK_FAIL;
+   }
+   dh = (uint64_t *)h->base + h->free;
 
-   bi = bat_iterator(b);
-   for (i = 0; i < b->batCount; i++) {
-   s = (str)BUNtvar(bi, i);
-   if (!strNil(s))
-   *dh++ = GDKstrimp_make_bitstring(s, head);
-   else
-   *dh++ = 0; /* no pairs in nil values */
+   bi = bat_iterator(b);
+   for (i = 0; i < b->batCount; i++) {
+   s = (str)BUNtvar(bi, i);
+   if (!strNil(s))
+   *dh++ = GDKstrimp_make_bitstring(s, head);
+   else
+   *dh++ = 0; /* no pairs in nil values */
 
+   }
+
+   /* After we have computed the strimp, attempt to write it back
+* to the BAT.
+*/
+   MT_lock_set(&b->batIdxLock);
+   b->tstrimps = h;
+   b->batDirtydesc = true;
+   /* persistStrimp(b) */
+   MT_lock_unset(&b->batIdxLock);
}
 
-   /* After we have computed the strimp, attempt to write it back
-* to the BAT.
-*/
-   MT_lock_set(>batIdxLock);
-   b->tstrimps = h;
-   b->batDirtydesc = true;
-   /* persistStrimp(b) */
-   MT_lock_unset(>batIdxLock);
-
TRC_DEBUG(ALGO, "strimp creation took " LLFMT " usec\n", GDKusec()-t0);
return GDK_SUCCEED;
 }
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: default - Show more information if Mz.py is not able to...

2021-04-09 Thread Panagiotis Koutsourakis
Changeset: 8088c952798c for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/8088c952798c
Modified Files:
testing/Mz.py.in
Branch: default
Log Message:

Show more information if Mz.py is not able to start the server


diffs (14 lines):

diff --git a/testing/Mz.py.in b/testing/Mz.py.in
--- a/testing/Mz.py.in
+++ b/testing/Mz.py.in
@@ -900,8 +900,8 @@ def GetBitsAndModsAndThreads(env) :
 if proc.returncode is None:
 killProc(proc, proc.stderr, cmd)
 proc.wait()
-if procdebug:
-print('GetBitsAndModsAndThreads: process exited "%s" (%s)\n' % 
('" "'.join(cmd), proc.returncode))
+if procdebug or proc.returncode != 0:
+print('GetBitsAndModsAndThreads: process exited "%s" (%s)\n' % 
('" "'.join(cmd), proc.returncode), file=sys.stderr)
 env['TST_MODS'] = []
 env['TST_BITS'] = ""
 env['TST_INT128'] = ""
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Create string imprints branch

2021-02-16 Thread Panagiotis Koutsourakis
Changeset: 6e9b8a1f0fc8 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6e9b8a1f0fc8
Branch: string_imprints
Log Message:

Create string imprints branch

___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: default - Fix JSON parsing of exponents

2021-02-17 Thread Panagiotis Koutsourakis
Changeset: 11b6539611d7 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=11b6539611d7
Modified Files:
monetdb5/modules/atoms/json.c
Branch: default
Log Message:

Fix JSON parsing of exponents

+ is an acceptable char in an exponent, but a digit needs to be present.


diffs (35 lines):

diff --git a/monetdb5/modules/atoms/json.c b/monetdb5/modules/atoms/json.c
--- a/monetdb5/modules/atoms/json.c
+++ b/monetdb5/modules/atoms/json.c
@@ -918,17 +918,29 @@ JSONfractionParser(const char *j, const 
 
 static bool
 JSONexponentParser(const char *j, const char **next) {
+   const char *s = j;
+   bool saw_digit = false;
+
if (*j != 'e' && *j != 'E') {
return false;
}
 
j++;
-   if (*j == '-')
+   if (*j == '-' || *j == '+')
j++;
 
-   for (; *j; j++)
+   for (; *j; j++) {
if (!isdigit((unsigned char)*j))
break;
+   saw_digit = true;
+   }
+
+
+   if (!saw_digit) {
+   j = s;
+   return false;
+   }
+
 
*next = j;
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: default - Make json.isvalid a no-op for json inputs

2021-02-17 Thread Panagiotis Koutsourakis
Changeset: c02fd4fc3853 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c02fd4fc3853
Modified Files:
monetdb5/modules/atoms/json.c
monetdb5/modules/atoms/json.mal
sql/scripts/40_json.sql
Branch: default
Log Message:

Make json.isvalid a no-op for json inputs

Also remove some other unneeded functions.

This fixes #6786


diffs (101 lines):

diff --git a/monetdb5/modules/atoms/json.c b/monetdb5/modules/atoms/json.c
--- a/monetdb5/modules/atoms/json.c
+++ b/monetdb5/modules/atoms/json.c
@@ -400,7 +400,7 @@ JSONstr2json(json *ret, str *j)
 }
 
 static str
-JSONisvalid(bit *ret, json *j)
+JSONisvalid(bit *ret, str *j)
 {
if (strNil(*j)) {
*ret = bit_nil;
@@ -2708,7 +2708,6 @@ static mel_atom json_init_atoms[] = {
 static mel_func json_init_funcs[] = {
  command("json", "new", JSONstr2json, false, "Convert string to its JSON. 
Dealing with escape characters", args(1,2, arg("",json),arg("j",str))),
  command("calc", "json", JSONstr2json, false, "Convert string to its JSON. 
Dealing with escape characters", args(1,2, arg("",json),arg("j",str))),
- command("calc", "json", JSONstr2json, false, "Convert JSON to JSON. Dealing 
with escape characters", args(1,2, arg("",json),arg("j",json))),
  command("json", "str", JSONjson2str, false, "Convert JSON to its string 
equivalent. Dealing with escape characters", args(1,2, 
arg("",str),arg("j",json))),
  command("json", "text", JSONjson2text, false, "Convert JSON values to their 
plain string equivalent.", args(1,2, arg("",str),arg("j",json))),
  command("json", "text", JSONjson2textSeparator, false, "Convert JSON values 
to their plain string equivalent, injecting a separator.", args(1,3, 
arg("",str),arg("j",json),arg("s",str))),
@@ -2728,12 +2727,9 @@ static mel_func json_init_funcs[] = {
  command("json", "filter", JSONfilterArray_hge, false, "", args(1,3, 
arg("",json),arg("name",json),arg("idx",hge))),
  command("json", "filter", JSONfilterArrayDefault_hge, false, "Extract a 
single array element", args(1,4, 
arg("",json),arg("name",json),arg("idx",hge),arg("other",str))),
 #endif
- command("json", "isvalid", JSONisvalid, false, "Validate the string as a 
valid JSON document", args(1,2, arg("",bit),arg("val",json))),
  command("json", "isobject", JSONisobject, false, "Validate the string as a 
valid JSON object", args(1,2, arg("",bit),arg("val",json))),
  command("json", "isarray", JSONisarray, false, "Validate the string as a 
valid JSON array", args(1,2, arg("",bit),arg("val",json))),
  command("json", "isvalid", JSONisvalid, false, "Validate the string as a 
valid JSON document", args(1,2, arg("",bit),arg("val",str))),
- command("json", "isobject", JSONisobject, false, "Validate the string as a 
valid JSON object", args(1,2, arg("",bit),arg("val",str))),
- command("json", "isarray", JSONisarray, false, "Validate the string as a 
valid JSON array", args(1,2, arg("",bit),arg("val",str))),
  command("json", "length", JSONlength, false, "Returns the number of elements 
in the outermost JSON object.", args(1,2, arg("",int),arg("val",json))),
  pattern("json", "unfold", JSONunfold, false, "Expands the outermost JSON 
object into key-value pairs.", args(2,3, 
batarg("k",str),batarg("v",json),arg("val",json))),
  pattern("json", "unfold", JSONunfold, false, "Expands the outermost JSON 
object into key-value pairs.", args(3,4, 
batarg("o",oid),batarg("k",str),batarg("v",json),arg("val",json))),
diff --git a/monetdb5/modules/atoms/json.mal b/monetdb5/modules/atoms/json.mal
--- a/monetdb5/modules/atoms/json.mal
+++ b/monetdb5/modules/atoms/json.mal
@@ -22,10 +22,6 @@ command calc.json(j:str):json
 address JSONstr2json
 comment "Convert string to its JSON. Dealing with escape characters";
 
-command calc.json(j:json):json
-address JSONstr2json
-comment "Convert JSON to JSON. Dealing with escape characters";
-
 command str(j:json):str
 address JSONjson2str
 comment "Convert JSON to its string equivalent. Dealing with escape 
characters";
@@ -72,10 +68,6 @@ command filter(name:json, idx:lng, other
 address JSONfilterArrayDefault_lng
 comment "Extract a single array element";
 
-command isvalid(val:json):bit
-address JSONisvalid
-comment "Validate the string as a valid JSON document";
-
 command isobject(val:json):bit
 address JSONisobject
 comment "Validate the string as a valid JSON object";
@@ -88,14 +80,6 @@ command isvalid(val:str):bit
 address JSONisvalid
 comment "Validate the string as a valid JSON document";
 
-command isobject(val:str):bit
-address JSONisobject
-comment "Validate the string as a valid JSON object";
-
-command isarray(val:str):bit
-address JSONisarray
-comment "Validate the string as a valid JSON array";
-
 command length(val:json):int
 address JSONlength
 comment "Returns the number of elements in the outermost JSON object.";
diff --git a/sql/scripts/40_json.sql b/sql/scripts/40_json.sql
--- a/sql/scripts/40_json.sql
+++ b/sql/scripts/40_json.sql
@@ -40,22 +40,13 @@ create 

MonetDB: default - Ugly hack to fix monetdbe compilation for doc...

2021-02-18 Thread Panagiotis Koutsourakis
Changeset: 119fcfed7edd for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=119fcfed7edd
Modified Files:
common/utils/mcrypt.c
Branch: default
Log Message:

Ugly hack to fix monetdbe compilation for docker pipeline


diffs (22 lines):

diff --git a/common/utils/mcrypt.c b/common/utils/mcrypt.c
--- a/common/utils/mcrypt.c
+++ b/common/utils/mcrypt.c
@@ -45,6 +45,18 @@ mcrypt_getHashAlgorithms(void)
 * desire.
 */
static const char *algorithms =
+   /* When compiling MonetDBe for docker, we use 
-DWITH_CRYPTO=OFF. This means that none of the hashing algorithms
+* are available and so we get a syntax error at 
mcrypt_getHashAlgorithms.
+*
+* This used to compile because it
+* unconditionally included PROT10.
+
+* This hack is dangerous because it will allow MonetDB server 
to be built even without openssl installed. A
+* sever like that will be incompatible with all clients 
because it does not implement MAPI correctly. Ideally
+* we should solve this at CMake level but it is difficult 
because the common modules between MonetDBe and
+* MonetDB server require substantially different compilation 
parameters.
+*/
+   "INVALID"
 #ifdef HAVE_RIPEMD160_UPDATE
",RIPEMD160"
 #endif
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Close the iterators

2021-08-13 Thread Panagiotis Koutsourakis
Changeset: 34549818041a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/34549818041a
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Close the iterators


diffs (68 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -306,6 +306,7 @@ STRMPbuildHeader(BAT *b, CharPair *hpair
}
}
}
+   bat_iterator_end();
 
// Choose the header pairs
STRMPchoosePairs(hist, hlen, hpairs);
@@ -496,6 +497,7 @@ STRMPfilter(BAT *b, char *q)
return NULL;
 }
 
+#if 0
 static void
 BATstrimpsync(void *arg)
 {
@@ -552,7 +554,6 @@ BATstrimpsync(void *arg)
 static void
 persistStrimp(BAT *b)
 {
-   TRC_DEBUG(ACCELERATOR, "zoo: %d\n", (BBP_status(b->batCacheid) & 
BBPEXISTING));
if((BBP_status(b->batCacheid) & BBPEXISTING)
   && b->batInserted == b->batCount
   && !b->theap->dirty
@@ -567,6 +568,7 @@ persistStrimp(BAT *b)
} else
TRC_DEBUG(ACCELERATOR, "persistStrimp(" ALGOBATFMT "): NOT 
persisting strimp\n", ALGOBATPAR(b));
 }
+#endif
 
 /* Create */
 gdk_return
@@ -587,9 +589,9 @@ STRMPcreate(BAT *b)
return GDK_SUCCEED;
 
/* Disable this before merging to default */
-   if (isVIEW(b)) {
+   if (VIEWtparent(b)) {
assert(b->tstrimps == NULL);
-   b = BBPdescriptor(VIEWtparent(b));
+   b = BBP_cache(VIEWtparent(b));
}
 
if ((h = STRMPcreateStrimpHeap(b)) == NULL) {
@@ -605,6 +607,7 @@ STRMPcreate(BAT *b)
else
*dh++ = 0; /* no pairs in nil values */
}
+   bat_iterator_end();
h->strimps.free += b->batCount*sizeof(uint64_t);
 
 
@@ -621,9 +624,9 @@ STRMPcreate(BAT *b)
/* After we have computed the strimp, attempt to write it back
 * to the BAT.
 */
-   MT_lock_set(>batIdxLock);
-   persistStrimp(b);
-   MT_lock_unset(>batIdxLock);
+   /* MT_lock_set(>batIdxLock); */
+   /* persistStrimp(b); */
+   /* MT_lock_unset(>batIdxLock); */
 
TRC_DEBUG(ACCELERATOR, "strimp creation took " LLFMT " usec\n", 
GDKusec()-t0);
return GDK_SUCCEED;
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Fix strimp creation and filtering

2021-08-13 Thread Panagiotis Koutsourakis
Changeset: 7d48a7a8c479 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/7d48a7a8c479
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Fix strimp creation and filtering


diffs (161 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -339,7 +339,7 @@ STRMPcreateStrimpHeap(BAT *b)
/* Make sure no other thread got here first */
 if (b->tstrimps == NULL) {
STRMPbuildHeader(b, hpairs); /* Find the header pairs */
-   sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the 
descriptor */
+   sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the 
descriptor and the pair sizes */
for (i = 0; i < STRIMP_HEADER_SIZE; i++) {
sz += hpairs[i].psize;
}
@@ -465,31 +465,42 @@ STRMPfilter(BAT *b, char *q)
BUN i;
uint64_t qbmask;
uint64_t *ptr;
+   Strimps *strmps;
 
-   if (b->tstrimps == NULL)
-   goto sfilter_fail;
+   if (isVIEW(b)) {
+   // b = BBP_cache(VIEWtparent(b));
+   BAT *pb = BBP_cache(VIEWtparent(b));
+   if (!BATcheckstrimps(pb))
+   goto sfilter_fail;
+   strmps = pb->tstrimps;
+   }
+   else {
+   if (!BATcheckstrimps(b))
+   goto sfilter_fail;
+   strmps = b->tstrimps;
+   }
 
r = COLnew(b->hseqbase, TYPE_oid, b->batCount, TRANSIENT);
if (r == NULL) {
goto sfilter_fail;
}
 
-   if (!BATcheckstrimps(b)) {
-   BBPunfix(r->batCacheid);
-   goto sfilter_fail;
-   }
-   qbmask = STRMPmakebitstring(q, b->tstrimps);
-   ptr = (uint64_t *)b->tstrimps->strimps_base;
+   qbmask = STRMPmakebitstring(q, strmps);
+   ptr = (uint64_t *)strmps->strimps_base;
 
for (i = 0; i < b->batCount; i++) {
if ((*(ptr + i) & qbmask) == qbmask) {
-   oid pos = i;
+   oid pos = i + b->hseqbase;
if (BUNappend(r, , false) != GDK_SUCCEED)
goto sfilter_fail;
}
}
 
r->tkey = true;
+   r->tsorted = true;
+   r->trevsorted = BATcount(r) <= 1;
+   r->tnil = false;
+   r->tnonil = true;
return virtualize(r);
 
 
@@ -497,7 +508,6 @@ STRMPfilter(BAT *b, char *q)
return NULL;
 }
 
-#if 0
 static void
 BATstrimpsync(void *arg)
 {
@@ -568,7 +578,8 @@ persistStrimp(BAT *b)
} else
TRC_DEBUG(ACCELERATOR, "persistStrimp(" ALGOBATFMT "): NOT 
persisting strimp\n", ALGOBATPAR(b));
 }
-#endif
+
+static ATOMIC_TYPE STRMPnthread = ATOMIC_VAR_INIT(0);
 
 /* Create */
 gdk_return
@@ -580,27 +591,33 @@ STRMPcreate(BAT *b)
str s;
Strimps *h;
uint64_t *dh;
+   BAT *pb;
 
-   assert(b->ttype == TYPE_str);
TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
-
+   if (b->ttype != TYPE_str) {
+   GDKerror("strimps only valid for strings\n");
+   return GDK_FAIL;
+   }
 
-   if (BATcheckstrimps(b))
+   (void)ATOMIC_INC();
+   /* Disable this before merging to default */
+if (VIEWtparent(b)) {
+   pb = BBP_cache(VIEWtparent(b));
+   assert(pb);
+   } else {
+   pb = b;
+   }
+
+   if (BATcheckstrimps(pb))
return GDK_SUCCEED;
 
-   /* Disable this before merging to default */
-   if (VIEWtparent(b)) {
-   assert(b->tstrimps == NULL);
-   b = BBP_cache(VIEWtparent(b));
-   }
-
-   if ((h = STRMPcreateStrimpHeap(b)) == NULL) {
+if ((h = STRMPcreateStrimpHeap(pb)) == NULL) {
return GDK_FAIL;
}
-   dh = (uint64_t *)((uint8_t*)h->strimps.base + h->strimps.free);
+   dh = (uint64_t *)((uint8_t*)h->strimps.base + h->strimps.free + 
b->hseqbase*8);
 
bi = bat_iterator(b);
-   for (i = 0; i < b->batCount; i++) {
+   for (i = 0; i < bi.count; i++) {
s = (str)BUNtvar(bi, i);
if (!strNil(s))
*dh++ = STRMPmakebitstring(s, h);
@@ -608,25 +625,16 @@ STRMPcreate(BAT *b)
*dh++ = 0; /* no pairs in nil values */
}
bat_iterator_end();
+
+   MT_lock_set(>batIdxLock);
h->strimps.free += b->batCount*sizeof(uint64_t);
-
+   MT_lock_unset(>batIdxLock);
 
-#ifndef NDEBUG
-   {
-   FILE *f = fopen("/tmp/strmp", "wb");
-   if (f) {
-   fwrite(h->strimps.base, 1, h->strimps.free, f);
-   fclose(f);
-   }
+   /* The thread that reaches this point last needs to write the strimp to 
disk. */
+   (void)ATOMIC_DEC();
+   if (STRMPnthread == 

MonetDB: string_imprints - Fix strimp pointers when reading from...

2021-08-13 Thread Panagiotis Koutsourakis
Changeset: 7de11c47ea3d for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/7de11c47ea3d
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Fix strimp pointers when reading from disk


diffs (23 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -77,8 +77,8 @@
 /* Macros for accessing metadata of a strimp. These are recorded in the
  * first 8 bytes of the heap.
  */
-#define NPAIRS(d) (((d) & (0xff << 8)) >> 8)
-#define HSIZE(d) (((d) & (0xffff << 15)) >> 15)
+#define NPAIRS(d) ((d) >> 8) & 0xff
+#define HSIZE(d) ((d) >> 16) & 0xffff
 
 #undef UTF8STRINGS /* Not using utf8 for now */
 #ifdef UTF8STRINGS
@@ -428,7 +428,7 @@ BATcheckstrimps(BAT *b)
&& HEAPload(>strimps, nme, 
"tstrimps", false) == GDK_SUCCEED) {
hp->sizes_base = (uint8_t 
*)hp->strimps.base + 8; /* sizes just after the descriptor */
hp->pairs_base = hp->sizes_base 
+ npairs; /* pairs just after the offsets */
-   hp->strimps_base = 
hp->sizes_base + hsize;/* bitmasks just after the pairs */
+   hp->strimps_base = 
hp->strimps.base + hsize;/* bitmasks just after the pairs */
 
close(fd);
hp->strimps.parentid = 
b->batCacheid;
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Merge with default

2021-08-13 Thread Panagiotis Koutsourakis
Changeset: 129da86e9686 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/129da86e9686
Modified Files:
gdk/gdk.h
gdk/gdk_bbp.c
gdk/gdk_private.h
monetdb5/modules/mal/batExtensions.c
sql/backends/monet5/sql.c
Branch: string_imprints
Log Message:

Merge with default


diffs (truncated from 126951 to 300 lines):

diff --git a/clients/ChangeLog b/clients/ChangeLog
--- a/clients/ChangeLog
+++ b/clients/ChangeLog
@@ -1,3 +1,10 @@
 # ChangeLog file for clients
 # This file is updated with Maddlog
 
+* Wed Aug 11 2021 Sjoerd Mullender 
+- A new output formatting mode was added to mclient.  Use -fcsv-noquote
+  to produce a CSV (comma-separated values) output where the quote
+  characters have not been escaped.  This can be useful when producing
+  a single column string output that should be saved as is, e.g. when
+  using the sys.dump_database() function.
+
diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -344,7 +344,7 @@ void HASHdestroy(BAT *b);
 BUN HASHlist(Hash *h, BUN i);
 BUN HASHprobe(const Hash *h, const void *v);
 void HEAP_free(Heap *heap, var_t block);
-void HEAP_initialize(Heap *heap, size_t nbytes, size_t nprivate, int 
alignment);
+gdk_return HEAP_initialize(Heap *heap, size_t nbytes, size_t nprivate, int 
alignment);
 var_t HEAP_malloc(BAT *b, size_t nbytes);
 void HEAPdecref(Heap *h, bool remove);
 gdk_return HEAPextend(Heap *h, size_t size, bool mayshare) 
__attribute__((__warn_unused_result__));
diff --git a/clients/mapiclient/mclient.1 b/clients/mapiclient/mclient.1
--- a/clients/mapiclient/mclient.1
+++ b/clients/mapiclient/mclient.1
@@ -204,6 +204,7 @@ The possible values are
 .BR expanded ,
 .BR x ,
 .BR csv ,
+.BR csv-noquote ,
 .BR tab ,
 .BR raw ,
 .BR xml ,
@@ -212,6 +213,8 @@ and
 .BR rowcount .
 .B csv
 is comma-separated values;
+.B csv-noquote
+is comma-separated values without escaping any quotes;
 .B tab
 is tab-separated values;
 .B raw
@@ -235,6 +238,10 @@ and
 is a variation on
 .B trash
 where only the number of affected rows is printed.
+Normal \fBcsv\fP and \fBtab\fP formatting will use double quotes
+around any fields that contain double quotes, white space or the
+separator.  The \fBcsv-noquote\fP format will prevent that and dump
+the contents of the field without any interpretation.
 In addition to plain \fBcsv\fP, two other forms are possible.
 \fBcsv=\fP\fIc\fP uses \fIc\fP as column separator; \fBcsv+\fP\fIc\fP
 uses \fIc\fP as column separator and produces a single header line in
diff --git a/clients/mapiclient/mclient.c b/clients/mapiclient/mclient.c
--- a/clients/mapiclient/mclient.c
+++ b/clients/mapiclient/mclient.c
@@ -95,6 +95,7 @@ enum formatters {
 static enum formatters formatter = NOformatter;
 char *separator = NULL;/* column separator for CSV/TAB format 
*/
 bool csvheader = false;/* include header line in CSV format */
+bool noquote = false;  /* don't use quotes in CSV format */
 
 #define DEFWIDTH 80
 
@@ -942,7 +943,7 @@ CSVrenderer(MapiHdl hdl)
while (!mnstr_errnr(toConsole) && (fields = fetch_row(hdl)) != 0) {
for (i = 0; i < fields; i++) {
s = mapi_fetch_field(hdl, i);
-   if (s != NULL && s[strcspn(s, specials)] != '\0') {
+   if (!noquote && s != NULL && s[strcspn(s, specials)] != 
'\0') {
mnstr_printf(toConsole, "%s\"",
 i == 0 ? "" : separator);
while (*s) {
@@ -1647,6 +1648,7 @@ setFormatter(const char *s)
free(separator);
separator = NULL;
csvheader = false;
+   noquote = false;
 #ifdef _TWO_DIGIT_EXPONENT
if (formatter == TESTformatter)
_set_output_format(0);
@@ -1673,6 +1675,29 @@ setFormatter(const char *s)
} else
separator = strdup(s + 4);
csvheader = true;
+   } else if (strcmp(s, "csv-noquote") == 0) {
+   noquote = true;
+   formatter = CSVformatter;
+   separator = strdup(",");
+   } else if (strncmp(s, "csv-noquote=", 12) == 0) {
+   noquote = true;
+   formatter = CSVformatter;
+   if (s[12] == '"') {
+   separator = strdup(s + 13);
+   if (separator[strlen(separator) - 1] == '"')
+   separator[strlen(separator) - 1] = 0;
+   } else
+   separator = strdup(s + 12);
+   } else if (strncmp(s, "csv-noquote+", 12) == 0) {
+   noquote = true;
+   formatter = CSVformatter;
+   if (s[12] == '"') {
+   separator = strdup(s + 13);
+   if (separator[strlen(separator) - 1] == '"')
+   

MonetDB: string_imprints - Fix strimp persistence

2021-08-13 Thread Panagiotis Koutsourakis
Changeset: b82dc476a040 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/b82dc476a040
Modified Files:
gdk/gdk_bbp.c
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Fix strimp persistence


diffs (40 lines):

diff --git a/gdk/gdk_bbp.c b/gdk/gdk_bbp.c
--- a/gdk/gdk_bbp.c
+++ b/gdk/gdk_bbp.c
@@ -4038,6 +4038,11 @@ BBPdiskscan(const char *parent, size_t b
 #else
delete = true;
 #endif
+   } else if (strncmp(p + 1, "tstrimps", 8) == 0) {
+   BAT *b = getdesc(bid);
+   delete = b == NULL;
+   if (!delete)
+   b->tstrimps = (Strimps *)1;
} else if (strncmp(p + 1, "new", 3) != 0) {
ok = false;
}
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -78,7 +78,7 @@
  * first 8 bytes of the heap.
  */
 #define NPAIRS(d) (((d) & (0xff << 8)) >> 8)
-#define HSIZE(d) (((d) & (0xffff << 16)) >> 16)
+#define HSIZE(d) (((d) & (0xffff << 15)) >> 15)
 
 #undef UTF8STRINGS /* Not using utf8 for now */
 #ifdef UTF8STRINGS
@@ -426,9 +426,9 @@ BATcheckstrimps(BAT *b)
  /* 
bitmasks */
  
BATcount(b)*(npairs/8))
&& HEAPload(>strimps, nme, 
"tstrimps", false) == GDK_SUCCEED) {
-   hp->sizes_base = (uint8_t *)hp 
+ 8; /* sizes start just after the descriptor */
-   hp->pairs_base = hp->sizes_base 
+ npairs;   /* pairs start after the offsets */
-   hp->strimps_base = 
hp->sizes_base + hsize;  /* bitmasks start after the pairs */
+   hp->sizes_base = (uint8_t 
*)hp->strimps.base + 8; /* sizes just after the descriptor */
+   hp->pairs_base = hp->sizes_base 
+ npairs; /* pairs just after the offsets */
+   hp->strimps_base = 
hp->sizes_base + hsize;/* bitmasks just after the pairs */
 
close(fd);
hp->strimps.parentid = 
b->batCacheid;
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Use candidate lists for strimps

2021-08-24 Thread Panagiotis Koutsourakis
Changeset: 3d18e45d5375 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/3d18e45d5375
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Use candidate lists for strimps

We use candidate lists both for strimp creation and for filtering.


diffs (136 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -242,20 +242,25 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai
lng t0 = 0;
BATiter bi;
str cs;
-   BUN i;
+   BUN i, ncand;
size_t hidx;
+   oid x;
size_t hlen;
PairHistogramElem *hist;
PairIterator pi, *pip;
CharPair cp, *cpp;
+   struct canditer ci;
 
-   (void)s;
 
TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
hlen = STRIMP_HISTSIZE;
if ((hist = (PairHistogramElem 
*)GDKmalloc(hlen*sizeof(PairHistogramElem))) == NULL) {
-   // TODO handle error
-   return 0;
+   return false;
+   }
+
+   ncand = canditer_init(, b, s);
+   if (ncand == 0) {
+   return false;
}
 
for(hidx = 0; hidx < hlen; hidx++) {
@@ -267,8 +272,9 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai
bi = bat_iterator(b);
pip = 
cpp = 
-   for (i = 0; i < b->batCount; i++) {
-   cs = (str)BUNtvar(bi, i);
+   for (i = 0; i < ncand; i++) {
+   x = canditer_next() - b->hseqbase;
+   cs = (str)BUNtvar(bi, x);
if (!strNil(cs)) {
pi.s = cs;
pi.pos = 0;
@@ -339,8 +345,8 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s)
if (b->tstrimps == NULL) {
MT_lock_set(>batIdxLock);
/* Make sure no other thread got here first */
-if (b->tstrimps == NULL) {
-   STRMPbuildHeader(b, s, hpairs); /* Find the header 
pairs */
+if (b->tstrimps == NULL &&
+   STRMPbuildHeader(b, s, hpairs)) { /* Find the header pairs, 
put the result in hpairs */
sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the 
descriptor and the pair sizes */
for (i = 0; i < STRIMP_HEADER_SIZE; i++) {
sz += hpairs[i].psize;
@@ -464,14 +470,14 @@ BAT *
 STRMPfilter(BAT *b, BAT *s, char *q)
 {
BAT *r = NULL;
-   BUN i;
+   BUN i, ncand;
uint64_t qbmask;
uint64_t *ptr;
Strimps *strmps;
-   (void)s;
+   oid x;
+   struct canditer ci;
 
if (isVIEW(b)) {
-   // b = BBP_cache(VIEWtparent(b));
BAT *pb = BBP_cache(VIEWtparent(b));
if (!BATcheckstrimps(pb))
goto sfilter_fail;
@@ -483,17 +489,27 @@ STRMPfilter(BAT *b, BAT *s, char *q)
strmps = b->tstrimps;
}
 
-   r = COLnew(b->hseqbase, TYPE_oid, b->batCount, TRANSIENT);
+   ncand = canditer_init(, b, s);
+   if (ncand == 0)
+   /* Is this correct? */
+   return BATdense(b->hseqbase, 0, 0);
+   r = COLnew(b->hseqbase, TYPE_oid, ncand, TRANSIENT);
if (r == NULL) {
goto sfilter_fail;
}
 
+   /* TODO: Compare patterns with and without SQL pattern metachars
+* (% and _). Theoretically they should produce the same results
+* because bitstring creation ignores punctuation characters
+* (see the macro isIgnored).
+*/
qbmask = STRMPmakebitstring(q, strmps);
ptr = (uint64_t *)strmps->strimps_base;
 
-   for (i = 0; i < b->batCount; i++) {
-   if ((*(ptr + i) & qbmask) == qbmask) {
-   oid pos = i + b->hseqbase;
+   for (i = 0; i < ncand; i++) {
+   x = canditer_next() - b->hseqbase;
+   if ((*(ptr + x) & qbmask) == qbmask) {
+   oid pos = x + b->hseqbase;
if (BUNappend(r, , false) != GDK_SUCCEED)
goto sfilter_fail;
}
@@ -590,11 +606,13 @@ STRMPcreate(BAT *b, BAT *s)
 {
lng t0 = 0;
BATiter bi;
-   BUN i;
+   BUN i, ncand;
str cs;
Strimps *h;
uint64_t *dh;
BAT *pb;
+   oid x;
+   struct canditer ci;
 
TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
if (b->ttype != TYPE_str) {
@@ -619,9 +637,12 @@ STRMPcreate(BAT *b, BAT *s)
}
dh = (uint64_t *)((uint8_t*)h->strimps.base + h->strimps.free + 
b->hseqbase*8);
 
+   ncand = canditer_init(, b, s);
+
bi = bat_iterator(b);
-   for (i = 0; i < bi.count; i++) {
-   cs = (str)BUNtvar(bi, i);
+   for (i = 0; i < ncand; i++) {
+   x = canditer_next() - b->hseqbase;
+   cs = (str)BUNtvar(bi, x);
if (!strNil(cs))
*dh++ = STRMPmakebitstring(cs, h);
   

MonetDB: string_imprints - Allocate a new mal block in optimizer

2021-08-24 Thread Panagiotis Koutsourakis
Changeset: b77f70925fae for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/b77f70925fae
Modified Files:
monetdb5/optimizer/opt_pipes.c
monetdb5/optimizer/opt_strimps.c
Branch: string_imprints
Log Message:

Allocate a new mal block in optimizer

Some fixes due to Martin:

1. Check beforehand whether the optimizer needs to run at all.
2. Allocate a new mal block and push instructions there.
3. Free the old mal block at the end of the run.


diffs (120 lines):

diff --git a/monetdb5/optimizer/opt_pipes.c b/monetdb5/optimizer/opt_pipes.c
--- a/monetdb5/optimizer/opt_pipes.c
+++ b/monetdb5/optimizer/opt_pipes.c
@@ -58,9 +58,9 @@ static struct PIPELINES {
 "optimizer.inline();"
 "optimizer.remap();"
 "optimizer.bincopyfrom();"
-"optimizer.strimps();"
 "optimizer.deadcode();"
 "optimizer.multiplex();"
+"optimizer.strimps();"
 "optimizer.generator();"
 "optimizer.profiler();"
 //"optimizer.candidates();" only for decoration in explain
diff --git a/monetdb5/optimizer/opt_strimps.c b/monetdb5/optimizer/opt_strimps.c
--- a/monetdb5/optimizer/opt_strimps.c
+++ b/monetdb5/optimizer/opt_strimps.c
@@ -23,9 +23,9 @@
 str
 OPTstrimpsImplementation(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr 
pci)
 {
-   int i, limit, needed =0, actions=0;
+   int i, limit, slimit, needed =0, actions=0;
// int mvcvar = -1;
-   InstrPtr p,q,r, *old = mb->stmt;
+   InstrPtr p, q, *old = mb->stmt;
char buf[256];
lng usec = GDKusec();
str msg = MAL_SUCCEED;
@@ -37,63 +37,63 @@ OPTstrimpsImplementation(Client cntxt, M
(void) cntxt;
(void) stk; /* to fool compilers */
 
+   limit= mb->stop;
 
if ( mb->inlineProp )
return MAL_SUCCEED;
 
-   // check applicability first
-   for( i=0; i < limit; i++){
+   for(i=0; i < limit; i++) {
p = old[i];
-   if ( getModuleId(p) == algebraRef && getFunctionId(p) == 
likeselectRef)
+   if (getModuleId(p) == algebraRef && getFunctionId(p) == 
likeselectRef)
needed = 1;
}
+
if (!needed)
goto bailout;
 
-   limit= mb->stop;
-   if ( newMalBlkStmt(mb, mb->ssize + 20) < 0)
+   if (newMalBlkStmt(mb, mb->ssize + 20) < 0)
throw(MAL,"optimizer.strimps", SQLSTATE(HY013) MAL_MALLOC_FAIL);
 
+   slimit = mb->stop;
+
for (i = 0; i < limit; i++) {
p = old[i];
-if (p->token == ENDsymbol){
-pushInstruction(mb,p);
-break;
-}
+   if (p->token == ENDsymbol) {
+   pushInstruction(mb,p);
+   break;
+   }
+
/* Look for bind operations on strings, because for those we 
migh need strimps */
 
if (getModuleId(p) == algebraRef && getFunctionId(p) == 
likeselectRef) {
-   q = newInstruction(0, strimpsRef, mkstrimpsRef); /* 
This should be void? */
-   setDestVar(q, newTmpVariable(mb, TYPE_void));
+
+   /* cst.vtype = TYPE_bit; */
+   /* nvar = defConstant(mb, TYPE_bit, ); */
+   q = newInstruction(mb, strimpsRef, 
strimpFilterSelectRef);
+   res = newTmpVariable(mb, newBatType(TYPE_oid));
+   setDestVar(q, res);
q = addArgument(mb, q, getArg(p, 1));
+   q = addArgument(mb, q, getArg(p, 2));
+   q = addArgument(mb, q, getArg(p, 3));
+   q = addArgument(mb, q, getArg(p, 6));
 
pushInstruction(mb, q);
typeChecker(cntxt->usermodule, mb, q, mb->stop-1, TRUE);
 
-   /* cst.vtype = TYPE_bit; */
-   /* nvar = defConstant(mb, TYPE_bit, ); */
-   r = newInstruction(mb, strimpsRef, 
strimpFilterSelectRef);
-   res = newTmpVariable(mb, newBatType(TYPE_oid));
-   setDestVar(r, res);
-   r = addArgument(mb, r, getArg(p, 1));
-   r = addArgument(mb, r, getArg(p, 2));
-   r = addArgument(mb, r, getArg(p, 3));
-   r = addArgument(mb, r, getArg(p, 6));
-
-   pushInstruction(mb, r);
-   // typeChecker(cntxt->usermodule, mb, r, mb->stop-1, 
TRUE);
+   p = setArgument(mb, p, 2, getArg(q, 0));
 
actions++;
}
pushInstruction(mb, p);
}
-   for (; i < slimit; i++)
-   if (old[i])
-   freeInstruction(old[i]);
+   (void)slimit;
+   /* for (; i < slimit; i++) */
+   /*  if (old[i]) */
+   /*  freeInstruction(old[i]); 

MonetDB: string_imprints - strimp creation and filtering should ...

2021-08-24 Thread Panagiotis Koutsourakis
Changeset: 139143c85939 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/139143c85939
Modified Files:
gdk/gdk_strimps.c
gdk/gdk_strimps.h
monetdb5/modules/mal/strimps.c
sql/backends/monet5/sql_strimps.c
Branch: string_imprints
Log Message:

strimp creation and filtering should work with candidates

This commit changes the interface of the GDK functions and how they are called.


diffs (190 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -238,10 +238,10 @@ STRMPchoosePairs(PairHistogramElem *hist
 }
 
 static bool
-STRMPbuildHeader(BAT *b, CharPair *hpairs) {
+STRMPbuildHeader(BAT *b, BAT *s, CharPair *hpairs) {
lng t0 = 0;
BATiter bi;
-   str s;
+   str cs;
BUN i;
size_t hidx;
size_t hlen;
@@ -249,6 +249,8 @@ STRMPbuildHeader(BAT *b, CharPair *hpair
PairIterator pi, *pip;
CharPair cp, *cpp;
 
+   (void)s;
+
TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
hlen = STRIMP_HISTSIZE;
if ((hist = (PairHistogramElem 
*)GDKmalloc(hlen*sizeof(PairHistogramElem))) == NULL) {
@@ -266,9 +268,9 @@ STRMPbuildHeader(BAT *b, CharPair *hpair
pip = 
cpp = 
for (i = 0; i < b->batCount; i++) {
-   s = (str)BUNtvar(bi, i);
-   if (!strNil(s)) {
-   pi.s = s;
+   cs = (str)BUNtvar(bi, i);
+   if (!strNil(cs)) {
+   pi.s = cs;
pi.pos = 0;
pi.lim = strlen(pi.s);
while (pair_at(pip, cpp)) {
@@ -324,7 +326,7 @@ STRMPbuildHeader(BAT *b, CharPair *hpair
 
 /* Create the heap for a string imprint. Returns NULL on failure. */
 static Strimps *
-STRMPcreateStrimpHeap(BAT *b)
+STRMPcreateStrimpHeap(BAT *b, BAT *s)
 {
uint8_t *h1, *h2;
Strimps *r = NULL;
@@ -338,7 +340,7 @@ STRMPcreateStrimpHeap(BAT *b)
MT_lock_set(>batIdxLock);
/* Make sure no other thread got here first */
 if (b->tstrimps == NULL) {
-   STRMPbuildHeader(b, hpairs); /* Find the header pairs */
+   STRMPbuildHeader(b, s, hpairs); /* Find the header 
pairs */
sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the 
descriptor and the pair sizes */
for (i = 0; i < STRIMP_HEADER_SIZE; i++) {
sz += hpairs[i].psize;
@@ -459,13 +461,14 @@ BATcheckstrimps(BAT *b)
  * list.
  */
 BAT *
-STRMPfilter(BAT *b, char *q)
+STRMPfilter(BAT *b, BAT *s, char *q)
 {
BAT *r = NULL;
BUN i;
uint64_t qbmask;
uint64_t *ptr;
Strimps *strmps;
+   (void)s;
 
if (isVIEW(b)) {
// b = BBP_cache(VIEWtparent(b));
@@ -583,12 +586,12 @@ static ATOMIC_TYPE STRMPnthread = ATOMIC
 
 /* Create */
 gdk_return
-STRMPcreate(BAT *b)
+STRMPcreate(BAT *b, BAT *s)
 {
lng t0 = 0;
BATiter bi;
BUN i;
-   str s;
+   str cs;
Strimps *h;
uint64_t *dh;
BAT *pb;
@@ -611,16 +614,16 @@ STRMPcreate(BAT *b)
if (BATcheckstrimps(pb))
return GDK_SUCCEED;
 
-if ((h = STRMPcreateStrimpHeap(pb)) == NULL) {
+if ((h = STRMPcreateStrimpHeap(pb, s)) == NULL) {
return GDK_FAIL;
}
dh = (uint64_t *)((uint8_t*)h->strimps.base + h->strimps.free + 
b->hseqbase*8);
 
bi = bat_iterator(b);
for (i = 0; i < bi.count; i++) {
-   s = (str)BUNtvar(bi, i);
-   if (!strNil(s))
-   *dh++ = STRMPmakebitstring(s, h);
+   cs = (str)BUNtvar(bi, i);
+   if (!strNil(cs))
+   *dh++ = STRMPmakebitstring(cs, h);
else
*dh++ = 0; /* no pairs in nil values */
}
diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h
--- a/gdk/gdk_strimps.h
+++ b/gdk/gdk_strimps.h
@@ -42,6 +42,6 @@ typedef struct {
 // gdk_export gdk_return STRMPmakehistogramBP(BAT *b, uint64_t *hist, size_t 
hist_size, size_t *nbins); // make static
 // gdk_export gdk_return STRMP_make_header(StrimpHeader *h, uint64_t *hist, 
size_t hist_size); // make static
 // gdk_export gdk_return STRMP_make_header(BAT *b);
-gdk_export gdk_return STRMPcreate(BAT *b);
-gdk_export BAT *STRMPfilter(BAT *b, char *q);
+gdk_export gdk_return STRMPcreate(BAT *b, BAT *s);
+gdk_export BAT *STRMPfilter(BAT *b, BAT *s, char *q);
 #endif /* _GDK_STRIMPS_H_ */
diff --git a/monetdb5/modules/mal/strimps.c b/monetdb5/modules/mal/strimps.c
--- a/monetdb5/modules/mal/strimps.c
+++ b/monetdb5/modules/mal/strimps.c
@@ -79,8 +79,8 @@ PATstrimp_makehist(Client cntxt, MalBlkP
 static str
 PATstrimpCreate(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
 {
-   bat bid;
-   BAT *b;
+   bat bid, sid;
+   BAT *b, *s;

MonetDB: string_imprints - Add logging info to STRMPfilter

2021-08-27 Thread Panagiotis Koutsourakis
Changeset: c70da469b348 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/c70da469b348
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Add logging info to STRMPfilter


diffs (23 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -476,6 +476,9 @@ STRMPfilter(BAT *b, BAT *s, char *q)
Strimps *strmps;
oid x;
struct canditer ci;
+   lng t0 = 0;
+
+   TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
 
if (isVIEW(b)) {
BAT *pb = BBP_cache(VIEWtparent(b));
@@ -520,6 +523,9 @@ STRMPfilter(BAT *b, BAT *s, char *q)
r->trevsorted = BATcount(r) <= 1;
r->tnil = false;
r->tnonil = true;
+   TRC_DEBUG(ACCELERATOR, "strimp prefiltering of " LLFMT
+ " items took " LLFMT " usec\n", ncand, GDKusec()-t0);
+   TRC_DEBUG(ACCELERATOR, "r->" ALGOBATFMT "\n", ALGOBATPAR(r) );
return virtualize(r);
 
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Fix initialization check for strimps

2021-08-27 Thread Panagiotis Koutsourakis
Changeset: 238caa07b6bc for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/238caa07b6bc
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Fix initialization check for strimps

A strimp is initialized if the strimp exists and the number of bitstrings is 
equal to the BAT count.


diffs (17 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -456,7 +456,12 @@ BATcheckstrimps(BAT *b)
}
MT_lock_unset(>batIdxLock);
 }
-ret = b->tstrimps != NULL;
+   /* The string imprint is initialized if the strimp pointer is
+* not null and the number of bitstrings is equal to the bat
+* count.
+*/
+ret = b->tstrimps != NULL &&
+   (b->tstrimps->strimps.free - ((char *)b->tstrimps->strimps_base 
- b->tstrimps->strimps.base))/sizeof(uint64_t) == b->batCount;
if (ret)
TRC_DEBUG(ACCELERATOR, "BATcheckstrimps(" ALGOBATFMT "): 
already has strimps, waited " LLFMT " usec\n", ALGOBATPAR(b), GDKusec() - t);
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Treat gdk_use_strimps as an integer

2021-08-27 Thread Panagiotis Koutsourakis
Changeset: 95d06449d0b1 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/95d06449d0b1
Modified Files:
monetdb5/modules/mal/pcre.c
Branch: string_imprints
Log Message:

Treat gdk_use_strimps as an integer


diffs (29 lines):

diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c
--- a/monetdb5/modules/mal/pcre.c
+++ b/monetdb5/modules/mal/pcre.c
@@ -1873,7 +1873,7 @@ PCRElikeselect(bat *ret, const bat *bid,
str msg = MAL_SUCCEED;
char *ppat = NULL;
bool use_re = false, use_strcmp = false, empty = false;
-   bool use_strimps = GDKgetenv("gdk_use_strimps");
+   bool use_strimps = GDKgetenv_int("gdk_use_strimps", 0);
 
if ((b = BATdescriptor(*bid)) == NULL) {
msg = createException(MAL, "algebra.likeselect", 
SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
@@ -1884,6 +1884,8 @@ PCRElikeselect(bat *ret, const bat *bid,
goto bailout;
}
 
+   assert(ATOMstorage(b->ttype) == TYPE_str);
+
if (use_strimps) {
if (STRMPcreate(b, NULL) == GDK_SUCCEED) {
BAT *tmp_s;
@@ -1895,7 +1897,6 @@ PCRElikeselect(bat *ret, const bat *bid,
 
}
 
-   assert(ATOMstorage(b->ttype) == TYPE_str);
if ((msg = choose_like_path(, _re, _strcmp, , pat, 
esc)) != MAL_SUCCEED)
goto bailout;
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Merge with default

2021-08-31 Thread Panagiotis Koutsourakis
Changeset: 8d90a78cdb68 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/8d90a78cdb68
Modified Files:
gdk/gdk.h
gdk/gdk_bbp.c
gdk/gdk_private.h
monetdb5/optimizer/opt_prelude.c
monetdb5/optimizer/opt_prelude.h
sql/backends/monet5/sql.c
sql/scripts/CMakeLists.txt
Branch: string_imprints
Log Message:

Merge with default


diffs (truncated from 32345 to 300 lines):

diff --git a/MonetDB.spec b/MonetDB.spec
--- a/MonetDB.spec
+++ b/MonetDB.spec
@@ -527,7 +527,6 @@ exit 0
 %{_libdir}/monetdb5/lib_capi.so
 %endif
 %{_libdir}/monetdb5/lib_generator.so
-%{_libdir}/monetdb5/lib_udf.so
 %doc %{_mandir}/man1/mserver5.1.gz
 %dir %{_datadir}/doc/MonetDB
 %docdir %{_datadir}/doc/MonetDB
@@ -832,6 +831,7 @@ rm -f %{buildroot}%{_libdir}/monetdb5/ru
 rm -f %{buildroot}%{_libdir}/monetdb5/lib_run_*.so
 rm -f %{buildroot}%{_libdir}/monetdb5/microbenchmark.mal
 rm -f %{buildroot}%{_libdir}/monetdb5/lib_microbenchmark*.so
+rm -f %{buildroot}%{_libdir}/monetdb5/lib_udf*.so
 rm -f %{buildroot}%{_bindir}/monetdb_mtest.sh
 rm -rf %{buildroot}%{_datadir}/monetdb # /cmake
 
diff --git a/NT/mksqlwxs.py b/NT/mksqlwxs.py
--- a/NT/mksqlwxs.py
+++ b/NT/mksqlwxs.py
@@ -191,9 +191,9 @@ def main():
 print(r'')
 print(r'  ')
 id = comp(features, id, 16,
-  [r'lib\monetdb5\{}'.format(x) for x in sorted(filter(lambda x: 
x.startswith('_') and x.endswith('.dll') and ('geom' not in x) and ('pyapi' not 
in x) and ('opt_sql_append' not in x) and ('run_' not in x) and 
('microbenchmark' not in x), os.listdir(os.path.join(sys.argv[3], 'lib', 
'monetdb5'])
+  [r'lib\monetdb5\{}'.format(x) for x in sorted(filter(lambda x: 
x.startswith('_') and x.endswith('.dll') and ('geom' not in x) and ('pyapi' not 
in x) and ('opt_sql_append' not in x) and ('run_' not in x) and 
('microbenchmark' not in x) and ('udf' not in x), 
os.listdir(os.path.join(sys.argv[3], 'lib', 'monetdb5'])
 id = comp(debug, id, 16,
-  [r'lib\monetdb5\{}'.format(x) for x in sorted(filter(lambda x: 
x.startswith('_') and x.endswith('.pdb') and ('geom' not in x) and 
('opt_sql_append' not in x) and ('run_' not in x) and ('microbenchmark' not in 
x), os.listdir(os.path.join(sys.argv[3], 'lib', 'monetdb5'])
+  [r'lib\monetdb5\{}'.format(x) for x in sorted(filter(lambda x: 
x.startswith('_') and x.endswith('.pdb') and ('geom' not in x) and 
('opt_sql_append' not in x) and ('run_' not in x) and ('microbenchmark' not in 
x) and ('udf' not in x), os.listdir(os.path.join(sys.argv[3], 'lib', 
'monetdb5'])
 id = comp(geom, id, 16,
   [r'lib\monetdb5\{}'.format(x) for x in sorted(filter(lambda x: 
x.startswith('_') and (x.endswith('.dll') or x.endswith('.pdb')) and ('geom' in 
x), os.listdir(os.path.join(sys.argv[3], 'lib', 'monetdb5'])
 id = comp(pyapi3, id, 16,
diff --git a/clients/Tests/MAL-signatures.stable.out 
b/clients/Tests/MAL-signatures.stable.out
--- a/clients/Tests/MAL-signatures.stable.out
+++ b/clients/Tests/MAL-signatures.stable.out
@@ -1,697 +1,694 @@
-stdout of test 'MAL-signatures` in directory 'clients` itself:
-
-#select * from sys.malfunctions() order by module, "function", address, 
signature, comment;
 % .%1, .%1,.%1,.%1,.%1 # table_name
 % module,  function,   signature,  address,comment # name
 % clob,clob,   clob,   clob,   clob # type
-% 12,  28, 313,42, 0 # length
-[ "aggr",  "all",  "command aggr.all(X_0:bat[:any_1]):any_1 ", 
"SQLall;",  ""  ]
-[ "aggr",  "allnotequal",  "pattern aggr.allnotequal(X_0:bat[:any_1], 
X_1:bat[:any_1]):bit ",  "SQLallnotequal;",  ""  ]
+% 12,  28, 313,42, 860 # length
+[ "aggr",  "all",  "command aggr.all(X_0:bat[:any_1]):any_1 ", 
"SQLall;",  "if all values in b are equal return this, else nil"]
+[ "aggr",  "allnotequal",  "pattern aggr.allnotequal(X_0:bat[:any_1], 
X_1:bat[:any_1]):bit ",  "SQLallnotequal;",  "if all values in r are 
not equal to l return true, else if r has nil nil else false"   ]
 [ "aggr",  "anyequal", "pattern aggr.anyequal(X_0:any_1, 
X_1:any_1):bit ", "CMDvarEQ;",""  ]
-[ "aggr",  "anyequal", "pattern aggr.anyequal(X_0:bat[:any_1], 
X_1:bat[:any_1]):bit ", "SQLanyequal;", ""  ]
-[ "aggr",  "avg",  "command aggr.avg(X_0:bat[:bte], X_1:bat[:oid], 
X_2:bat[:any_1]):bat[:dbl] ",   "AGGRavg13_dbl;",   ""  ]
-[ "aggr",  "avg",  "command aggr.avg(X_0:bat[:dbl], X_1:bat[:oid], 
X_2:bat[:any_1]):bat[:dbl] ",   "AGGRavg13_dbl;",   ""  ]
-[ "aggr",  "avg",  "command aggr.avg(X_0:bat[:flt], X_1:bat[:oid], 
X_2:bat[:any_1]):bat[:dbl] ",   "AGGRavg13_dbl;",   ""  ]
-[ "aggr",  "avg",  "command aggr.avg(X_0:bat[:int], X_1:bat[:oid], 
X_2:bat[:any_1]):bat[:dbl] ",   "AGGRavg13_dbl;",   ""  ]
-[ "aggr",  "avg",  "command 

MonetDB: string_imprints - Code cleanup

2021-08-31 Thread Panagiotis Koutsourakis
Changeset: 3ca155710c3f for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/3ca155710c3f
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Code cleanup


diffs (88 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -77,8 +77,8 @@
 /* Macros for accessing metadada of a strimp. These are recorded in the
  * first 8 bytes of the heap.
  */
-#define NPAIRS(d) ((d) >> 8) & 0xff
-#define HSIZE(d) ((d) >> 16) & 0x
+#define NPAIRS(d) (((d) >> 8) & 0xff)
+#define HSIZE(d) (((d) >> 16) & 0x)
 
 #undef UTF8STRINGS /* Not using utf8 for now */
 #ifdef UTF8STRINGS
@@ -371,9 +371,10 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s)
r->pairs_base = h2 = (uint8_t *)h1 + STRIMP_HEADER_SIZE;
 
for (i = 0; i < STRIMP_HEADER_SIZE; i++) {
-   *(h1 + i) = hpairs[i].psize;
-   memcpy(h2, hpairs[i].pbytes, hpairs[i].psize);
-   h2 += hpairs[i].psize;
+   uint8_t psize = hpairs[i].psize;
+   h1[i] = psize;
+   memcpy(h2, hpairs[i].pbytes, psize);
+   h2 += psize;
}
r->strimps_base = h2;
r->strimps.free = sz;
@@ -386,12 +387,20 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s)
 return b->tstrimps;
 }
 
+#define STRIMP_COMPLETE(b) \
+   b->tstrimps != NULL &&\
+   (b->tstrimps->strimps.free - ((char *)b->tstrimps->strimps_base - 
b->tstrimps->strimps.base))/sizeof(uint64_t) == b->batCount
+
 static bool
 BATcheckstrimps(BAT *b)
 {
bool ret;
lng t = GDKusec();
 
+   if (b == NULL)
+   return false;
+
+   assert(b->batCacheid > 0);
if (b->tstrimps == (Strimps *)1) {
assert(!GDKinmemory(b->theap->farmid));
MT_lock_set(>batIdxLock);
@@ -422,7 +431,7 @@ BATcheckstrimps(BAT *b)
&& (desc & 0xff) == STRIMP_VERSION
&& ((npairs = NPAIRS(desc)) == 32 
|| npairs == 64)
&& (hsize = HSIZE(desc)) >= 96 && 
hsize <= 640
-   && ((desc & ((uint64_t)0xff << 32)) 
>> 32) == 1
+   && ((desc >> 32) & 0xff) == 1 /* 
check the persistence byte */
&& fstat(fd, ) == 0
&& st.st_size >= (off_t) 
(hp->strimps.free = hp->strimps.size =
  /* 
descriptor */
@@ -460,12 +469,15 @@ BATcheckstrimps(BAT *b)
 * not null and the number of bitstrings is equal to the bat
 * count.
 */
-ret = b->tstrimps != NULL &&
-   (b->tstrimps->strimps.free - ((char *)b->tstrimps->strimps_base 
- b->tstrimps->strimps.base))/sizeof(uint64_t) == b->batCount;
-   if (ret)
-   TRC_DEBUG(ACCELERATOR, "BATcheckstrimps(" ALGOBATFMT "): 
already has strimps, waited " LLFMT " usec\n", ALGOBATPAR(b), GDKusec() - t);
+   assert(!b->tstrimps || (b->tstrimps->strimps.free - HSIZE(((uint64_t 
*)b->tstrimps->strimps.base)[0]))/sizeof(uint64_t) <= b->batCount);
+   ret = STRIMP_COMPLETE(b);
+if (ret) {
+   TRC_DEBUG(ACCELERATOR,
+ "BATcheckstrimps(" ALGOBATFMT "): already has 
strimps, waited " LLFMT " usec\n",
+ ALGOBATPAR(b), GDKusec() - t);
+   }
 
-   return ret;
+return ret;
 }
 
 /* Filter a BAT b using a string q. Return the result as a candidate
@@ -582,7 +594,7 @@ BATstrimpsync(void *arg)
failed = "";
}
}
-   TRC_DEBUG(ACCELERATOR, "BATstrimpsync(%s): strimps 
persisted"
+   TRC_DEBUG(ACCELERATOR, "BATstrimpsync(%s): strimp 
persisted"
  " (" LLFMT " usec)%s\n",
  BATgetId(b), GDKusec() - t0, failed);
}
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Do not create strimps for small bats

2021-08-31 Thread Panagiotis Koutsourakis
Changeset: 2e33639ce402 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/2e33639ce402
Modified Files:
gdk/gdk_strimps.h
monetdb5/modules/mal/pcre.c
Branch: string_imprints
Log Message:

Do not create strimps for small bats


diffs (32 lines):

diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h
--- a/gdk/gdk_strimps.h
+++ b/gdk/gdk_strimps.h
@@ -15,6 +15,7 @@
 #define STRIMP_VERSION (uint64_t)1
 #define STRIMP_HISTSIZE 256*256
 #define STRIMP_HEADER_SIZE 64
+#define STRIMP_CREATION_THRESHOLD 5000 /* do not create strimp for "small" 
BATs */
 
 typedef struct {
uint8_t *pbytes;
diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c
--- a/monetdb5/modules/mal/pcre.c
+++ b/monetdb5/modules/mal/pcre.c
@@ -1886,13 +1886,14 @@ PCRElikeselect(bat *ret, const bat *bid,
 
assert(ATOMstorage(b->ttype) == TYPE_str);
 
-   if (use_strimps) {
+   if (use_strimps && BATcount(b) >= STRIMP_CREATION_THRESHOLD) {
if (STRMPcreate(b, NULL) == GDK_SUCCEED) {
BAT *tmp_s;
tmp_s = STRMPfilter(b, s, *pat);
-   if(s)
+   if (tmp_s && s) {
BBPunfix(s->batCacheid);
-   s = tmp_s;
+   s = tmp_s;
+   }
} /* If we cannot create the strimp just continue normally */
 
}
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Fix reading strimps from the disk

2021-08-31 Thread Panagiotis Koutsourakis
Changeset: 1d265612c715 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/1d265612c715
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Fix reading strimps from the disk


diffs (44 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -423,25 +423,29 @@ BATcheckstrimps(BAT *b)
uint64_t npairs;
uint64_t hsize;
/* Read the 8 byte long strimp
-* descriptor and make sure that
-* the number of pairs is either
-* 32 or 64.
+* descriptor.
+*
+* NPAIRS must be 64 in the
+* current implementation.
+*
+* HSIZE must be between 200 and
+* 584 (inclusive): 8 bytes the
+* descritor, 64 bytes the pair
+* sizes and n*64 bytes the
+* actual pairs where 2 <= n <=
+* 8.
 */
if (read(fd, , 8) == 8
&& (desc & 0xff) == STRIMP_VERSION
-   && ((npairs = NPAIRS(desc)) == 32 
|| npairs == 64)
-   && (hsize = HSIZE(desc)) >= 96 && 
hsize <= 640
+   && ((npairs = NPAIRS(desc)) == 64)
+   && (hsize = HSIZE(desc)) >= 200 && 
hsize <= 584
&& ((desc >> 32) & 0xff) == 1 /* 
check the persistence byte */
&& fstat(fd, ) == 0
&& st.st_size >= (off_t) 
(hp->strimps.free = hp->strimps.size =
- /* 
descriptor */
- 8 +
- /* header 
size (offsets + pairs) */
+ /* header 
size (desc + offsets + pairs) */
  hsize +
- /* 
padding to 4 or 8 byte boundary */
- 
hsize%(npairs/8) == (npairs/8)? 0 : (npairs/8+((npairs/8) - hsize%(npairs/8))) +
  /* 
bitmasks */
- 
BATcount(b)*(npairs/8))
+ 
BATcount(b)*sizeof(uint64_t))
&& HEAPload(>strimps, nme, 
"tstrimps", false) == GDK_SUCCEED) {
hp->sizes_base = (uint8_t 
*)hp->strimps.base + 8; /* sizes just after the descriptor */
hp->pairs_base = hp->sizes_base 
+ npairs; /* pairs just after the offsets */
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Persist strimp when it is fully created

2021-08-31 Thread Panagiotis Koutsourakis
Changeset: 7bd939f4ba52 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/7bd939f4ba52
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Persist strimp when it is fully created

A strimp is fully created when we have computed a bitstring for every
string in the bat.


diffs (32 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -621,8 +621,6 @@ persistStrimp(BAT *b)
TRC_DEBUG(ACCELERATOR, "persistStrimp(" ALGOBATFMT "): NOT 
persisting strimp\n", ALGOBATPAR(b));
 }
 
-static ATOMIC_TYPE STRMPnthread = ATOMIC_VAR_INIT(0);
-
 /* Create */
 gdk_return
 STRMPcreate(BAT *b, BAT *s)
@@ -643,7 +641,6 @@ STRMPcreate(BAT *b, BAT *s)
return GDK_FAIL;
}
 
-   (void)ATOMIC_INC();
/* Disable this before merging to default */
 if (VIEWtparent(b)) {
pb = BBP_cache(VIEWtparent(b));
@@ -678,10 +675,8 @@ STRMPcreate(BAT *b, BAT *s)
MT_lock_unset(>batIdxLock);
 
/* The thread that reaches this point last needs to write the strimp to 
disk. */
-   (void)ATOMIC_DEC();
-   if (STRMPnthread == 0) {
+   if (STRIMP_COMPLETE(pb))
persistStrimp(pb);
-   }
 
TRC_DEBUG(ACCELERATOR, "strimp creation took " LLFMT " usec\n", 
GDKusec()-t0);
return GDK_SUCCEED;
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Use correct address to bitstrings

2021-08-31 Thread Panagiotis Koutsourakis
Changeset: 97a1a209934a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/97a1a209934a
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Use correct address to bitstrings

Specifically do not use the value of strimps.free because it is
updated by multiple threads.


diffs (12 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -655,7 +655,7 @@ STRMPcreate(BAT *b, BAT *s)
 if ((h = STRMPcreateStrimpHeap(pb, s)) == NULL) {
return GDK_FAIL;
}
-   dh = (uint64_t *)((uint8_t*)h->strimps.base + h->strimps.free + 
b->hseqbase*8);
+   dh = (uint64_t *)h->strimps_base + b->hseqbase;
 
ncand = canditer_init(, b, s);
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Add a GDK env var to enable use of st...

2021-08-26 Thread Panagiotis Koutsourakis
Changeset: c3fda7365a01 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/c3fda7365a01
Modified Files:
monetdb5/modules/mal/pcre.c
Branch: string_imprints
Log Message:

Add a GDK env var to enable use of strimps


diffs (12 lines):

diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c
--- a/monetdb5/modules/mal/pcre.c
+++ b/monetdb5/modules/mal/pcre.c
@@ -1873,7 +1873,7 @@ PCRElikeselect(bat *ret, const bat *bid,
str msg = MAL_SUCCEED;
char *ppat = NULL;
bool use_re = false, use_strcmp = false, empty = false;
-   bool use_strimps = true;
+   bool use_strimps = GDKgetenv("gdk_use_strimps");
 
if ((b = BATdescriptor(*bid)) == NULL) {
msg = createException(MAL, "algebra.likeselect", 
SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Fix mitosis bug in strimp filtering

2021-08-26 Thread Panagiotis Koutsourakis
Changeset: 49164109a169 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/49164109a169
Modified Files:
gdk/gdk_strimps.c
monetdb5/modules/mal/pcre.c
Branch: string_imprints
Log Message:

Fix mitosis bug in strimp filtering


diffs (70 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -472,7 +472,7 @@ STRMPfilter(BAT *b, BAT *s, char *q)
BAT *r = NULL;
BUN i, ncand;
uint64_t qbmask;
-   uint64_t *ptr;
+   uint64_t *bitstring_array;
Strimps *strmps;
oid x;
struct canditer ci;
@@ -491,7 +491,6 @@ STRMPfilter(BAT *b, BAT *s, char *q)
 
ncand = canditer_init(, b, s);
if (ncand == 0)
-   /* Is this correct? */
return BATdense(b->hseqbase, 0, 0);
r = COLnew(b->hseqbase, TYPE_oid, ncand, TRANSIENT);
if (r == NULL) {
@@ -504,14 +503,15 @@ STRMPfilter(BAT *b, BAT *s, char *q)
 * (see the macro isIgnored).
 */
qbmask = STRMPmakebitstring(q, strmps);
-   ptr = (uint64_t *)strmps->strimps_base;
+   bitstring_array = (uint64_t *)strmps->strimps_base;
 
for (i = 0; i < ncand; i++) {
-   x = canditer_next() - b->hseqbase;
-   if ((*(ptr + x) & qbmask) == qbmask) {
-   oid pos = x + b->hseqbase;
-   if (BUNappend(r, , false) != GDK_SUCCEED)
+   x = canditer_next();
+   if ((bitstring_array[x] & qbmask) == qbmask) {
+   if (BUNappend(r, , false) != GDK_SUCCEED) {
+   BBPunfix(r->batCacheid);
goto sfilter_fail;
+   }
}
}
 
diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c
--- a/monetdb5/modules/mal/pcre.c
+++ b/monetdb5/modules/mal/pcre.c
@@ -1873,6 +1873,7 @@ PCRElikeselect(bat *ret, const bat *bid,
str msg = MAL_SUCCEED;
char *ppat = NULL;
bool use_re = false, use_strcmp = false, empty = false;
+   bool use_strimps = true;
 
if ((b = BATdescriptor(*bid)) == NULL) {
msg = createException(MAL, "algebra.likeselect", 
SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
@@ -1883,6 +1884,17 @@ PCRElikeselect(bat *ret, const bat *bid,
goto bailout;
}
 
+   if (use_strimps) {
+   if (STRMPcreate(b, NULL) == GDK_SUCCEED) {
+   BAT *tmp_s;
+   tmp_s = STRMPfilter(b, s, *pat);
+   if(s)
+   BBPunfix(s->batCacheid);
+   s = tmp_s;
+   } /* If we cannot create the strimp just continue normally */
+
+   }
+
assert(ATOMstorage(b->ttype) == TYPE_str);
if ((msg = choose_like_path(, _re, _strcmp, , pat, 
esc)) != MAL_SUCCEED)
goto bailout;
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Remove unused code

2021-08-19 Thread Panagiotis Koutsourakis
Changeset: e2e6c0d4dbf4 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/e2e6c0d4dbf4
Modified Files:
monetdb5/modules/mal/batExtensions.c
Branch: string_imprints
Log Message:

Remove unused code


diffs (76 lines):

diff --git a/monetdb5/modules/mal/batExtensions.c 
b/monetdb5/modules/mal/batExtensions.c
--- a/monetdb5/modules/mal/batExtensions.c
+++ b/monetdb5/modules/mal/batExtensions.c
@@ -282,72 +282,6 @@ CMDBATappend_bulk(Client cntxt, MalBlkPt
return MAL_SUCCEED;
 }
 
-#if 0
-/*
- * String imprints.
- */
-static str
-PATstrimp_ndigrams(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
-{
-   bat bid;
-   BAT *b;
-   size_t n;
-
-   (void)cntxt;
-   (void)mb;
-
-   // return mythrow(MAL, "batcalc.striter", OPERATION_FAILED);
-   bid = *getArgReference_bat(stk, pci, 1);
-   if ((b = BATdescriptor(bid)) == NULL)
-   throw(MAL, "bat.strimpDigrams", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
-
-   if (!STRMPndigrams(b, )) {
-   throw(MAL, "bat.strimpDigrams", SQLSTATE(HY002) 
OPERATION_FAILED);
-   }
-
-   *getArgReference_lng(stk, pci, 0) = n;
-
-   return MAL_SUCCEED;
-}
-
-static str
-PATstrimp_makehist(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
-{
-   bat bid;
-   BAT *b, *ob;
-   size_t i;
-   uint64_t hist[STRIMP_HISTSIZE];
-   size_t count;
-
-   (void)cntxt;
-   (void)mb;
-
-   bid = *getArgReference_bat(stk, pci, 2);
-   if ((b = BATdescriptor(bid)) == NULL)
-   throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
-
-   if (!STRMPmakehistogram(b, hist, STRIMP_HISTSIZE, )) {
-   throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) 
OPERATION_FAILED);
-   }
-
-   ob = COLnew(0, TYPE_lng, STRIMP_HISTSIZE, TRANSIENT);
-   if (ob == NULL) {
-   throw(MAL, "bat.strimpHistogram", SQLSTATE(HY013) 
MAL_MALLOC_FAIL);
-   }
-
-   for (i=0; i < STRIMP_HISTSIZE; i++) {
-   if (BUNappend(ob, hist + i, false) != GDK_SUCCEED)
-   throw(MAL, "bat.strimpHistogram", SQLSTATE(HY013) 
MAL_MALLOC_FAIL);
-   }
-
-   *getArgReference_lng(stk, pci, 0) = count;
-   *getArgReference_bat(stk, pci, 1) = ob->batCacheid;
-
-   BBPkeepref(ob->batCacheid);
-   return MAL_SUCCEED;
-}
-#endif
-
 #include "mel.h"
 mel_func batExtensions_init_funcs[] = {
  pattern("bat", "new", CMDBATnew, false, "", args(1,2, 
batargany("",1),argany("tt",1))),
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Scaffolding for strimp optimizer

2021-08-19 Thread Panagiotis Koutsourakis
Changeset: 8ea09480b1ea for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/8ea09480b1ea
Added Files:
monetdb5/optimizer/opt_strimps.c
monetdb5/optimizer/opt_strimps.h
Modified Files:
monetdb5/modules/mal/strimps.c
monetdb5/optimizer/CMakeLists.txt
monetdb5/optimizer/opt_pipes.c
monetdb5/optimizer/opt_prelude.c
monetdb5/optimizer/opt_prelude.h
monetdb5/optimizer/opt_wrapper.c
monetdb5/optimizer/optimizer.c
Branch: string_imprints
Log Message:

Scaffolding for strimp optimizer


diffs (242 lines):

diff --git a/monetdb5/modules/mal/strimps.c b/monetdb5/modules/mal/strimps.c
--- a/monetdb5/modules/mal/strimps.c
+++ b/monetdb5/modules/mal/strimps.c
@@ -86,10 +86,10 @@ PATstrimpCreate(Client cntxt, MalBlkPtr 
 
bid = *getArgReference_bat(stk, pci, 1);
if ((b = BATdescriptor(bid)) == NULL)
-   throw(MAL, "strimps.strimpHeader", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
+   throw(MAL, "strimps.strimpCreate", SQLSTATE(HY002) 
RUNTIME_OBJECT_MISSING);
 
if(STRMPcreate(b) != GDK_SUCCEED)
-   throw(MAL, "strimps.strimpHistogram", SQLSTATE(HY002) 
OPERATION_FAILED);
+   throw(MAL, "strimps.strimpCreate", SQLSTATE(HY002) 
OPERATION_FAILED);
 
// *getArgReference_lng(stk, pci, 0) = 0;
return MAL_SUCCEED;
diff --git a/monetdb5/optimizer/CMakeLists.txt 
b/monetdb5/optimizer/CMakeLists.txt
--- a/monetdb5/optimizer/CMakeLists.txt
+++ b/monetdb5/optimizer/CMakeLists.txt
@@ -49,6 +49,7 @@ target_sources(optimizer
   opt_postfix.c opt_postfix.h
   opt_volcano.c opt_volcano.h
   opt_fastpath.c opt_fastpath.h
+  opt_strimps.c opt_strimps.h
   opt_wrapper.c
   PUBLIC
   ${CMAKE_CURRENT_SOURCE_DIR}/opt_pipes.h)
diff --git a/monetdb5/optimizer/opt_pipes.c b/monetdb5/optimizer/opt_pipes.c
--- a/monetdb5/optimizer/opt_pipes.c
+++ b/monetdb5/optimizer/opt_pipes.c
@@ -49,7 +49,20 @@ static struct PIPELINES {
 "optimizer.deadcode();"
 "optimizer.multiplex();"
 "optimizer.generator();"
-"optimizer.profiler();" 
+"optimizer.profiler();"
+//"optimizer.candidates();" only for decoration in explain
+//"optimizer.mask();"
+"optimizer.garbageCollector();",
+"stable", NULL, 1},
+   {"minimal_strimps_pipe",
+"optimizer.inline();"
+"optimizer.remap();"
+"optimizer.bincopyfrom();"
+"optimizer.strimps();"
+"optimizer.deadcode();"
+"optimizer.multiplex();"
+"optimizer.generator();"
+"optimizer.profiler();"
 //"optimizer.candidates();" only for decoration in explain
 //"optimizer.mask();"
 "optimizer.garbageCollector();",
diff --git a/monetdb5/optimizer/opt_prelude.c b/monetdb5/optimizer/opt_prelude.c
--- a/monetdb5/optimizer/opt_prelude.c
+++ b/monetdb5/optimizer/opt_prelude.c
@@ -188,6 +188,7 @@ const char *minusRef;
 const char *mirrorRef;
 const char *mitosisRef;
 const char *mkeyRef;
+const char *mkstrimpsRef;
 const char *mmathRef;
 const char *modRef;
 const char *mtimeRef;
@@ -279,6 +280,8 @@ const char *startRef;
 const char *starttraceRef;
 const char *stoptraceRef;
 const char *streamsRef;
+const char *strimpFilterSelectRef;
+const char *strimpsRef;
 const char *strRef;
 const char *subavgRef;
 const char *subcountRef;
@@ -495,6 +498,7 @@ void optimizerInit(void)
mirrorRef = putName("mirror");
mitosisRef = putName("mitosis");
mkeyRef = putName("mkey");
+   mkstrimpsRef = putName("mkstrimp");
mmathRef = putName("mmath");
modRef = putName("%");
mtimeRef = putName("mtime");
@@ -586,6 +590,7 @@ void optimizerInit(void)
starttraceRef = putName("starttrace");
stoptraceRef = putName("stoptrace");
streamsRef = putName("streams");
+   strimpsRef = putName("strimps");
strRef = putName("str");
subavgRef = putName("subavg");
subcountRef = putName("subcount");
diff --git a/monetdb5/optimizer/opt_prelude.h b/monetdb5/optimizer/opt_prelude.h
--- a/monetdb5/optimizer/opt_prelude.h
+++ b/monetdb5/optimizer/opt_prelude.h
@@ -186,6 +186,7 @@ mal_export  const char *minusRef;
 mal_export  const char *mirrorRef;
 mal_export  const char *mitosisRef;
 mal_export  const char *mkeyRef;
+mal_export  const char *mkstrimpsRef;
 mal_export  const char *mmathRef;
 mal_export  const char *modRef;
 mal_export  const char *mtimeRef;
@@ -278,6 +279,8 @@ mal_export  const char *startRef;
 mal_export  const char *starttraceRef;
 mal_export  const char *stoptraceRef;
 mal_export  const char *streamsRef;
+mal_export  const char *strimpFilterSelectRef;
+mal_export  const char *strimpsRef;
 mal_export  const char *strRef;
 mal_export  const char *subavgRef;
 mal_export  const char *subcountRef;
diff --git a/monetdb5/optimizer/opt_strimps.c b/monetdb5/optimizer/opt_strimps.c
new file mode 100644
--- /dev/null
+++ b/monetdb5/optimizer/opt_strimps.c
@@ 

MonetDB: string_imprints - Add filtering to the plan (WIP)

2021-08-19 Thread Panagiotis Koutsourakis
Changeset: 2569ffa9872b for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/2569ffa9872b
Modified Files:
monetdb5/optimizer/opt_prelude.c
monetdb5/optimizer/opt_strimps.c
Branch: string_imprints
Log Message:

Add filtering to the plan (WIP)


diffs (63 lines):

diff --git a/monetdb5/optimizer/opt_prelude.c b/monetdb5/optimizer/opt_prelude.c
--- a/monetdb5/optimizer/opt_prelude.c
+++ b/monetdb5/optimizer/opt_prelude.c
@@ -591,6 +591,7 @@ void optimizerInit(void)
stoptraceRef = putName("stoptrace");
streamsRef = putName("streams");
strimpsRef = putName("strimps");
+   strimpFilterSelectRef = putName("strimpfilterselect");
strRef = putName("str");
subavgRef = putName("subavg");
subcountRef = putName("subcount");
diff --git a/monetdb5/optimizer/opt_strimps.c b/monetdb5/optimizer/opt_strimps.c
--- a/monetdb5/optimizer/opt_strimps.c
+++ b/monetdb5/optimizer/opt_strimps.c
@@ -26,10 +26,13 @@ OPTstrimpsImplementation(Client cntxt, M
int i, limit;
// int mvcvar = -1;
int count=0;
-   InstrPtr p,q, *old = mb->stmt;
+   InstrPtr p,q,r, *old = mb->stmt;
char buf[256];
lng usec = GDKusec();
str msg = MAL_SUCCEED;
+   /* int res, nvar; */
+   /* ValRecord cst; */
+   int res;
 
(void) pci;
(void) cntxt;
@@ -53,10 +56,24 @@ OPTstrimpsImplementation(Client cntxt, M
pushInstruction(mb, q);
typeChecker(cntxt->usermodule, mb, q, mb->stop-1, TRUE);
 
+   /* cst.vtype = TYPE_bit; */
+   /* nvar = defConstant(mb, TYPE_bit, ); */
+   r = newInstruction(mb, strimpsRef, 
strimpFilterSelectRef);
+   res = newTmpVariable(mb, newBatType(TYPE_oid));
+   setDestVar(r, res);
+   r = addArgument(mb, r, getArg(p, 1));
+   r = addArgument(mb, r, getArg(p, 2));
+   r = addArgument(mb, r, getArg(p, 3));
+   r = addArgument(mb, r, getArg(p, 6));
+
+   pushInstruction(mb, r);
+   // typeChecker(cntxt->usermodule, mb, r, mb->stop-1, 
TRUE);
+
count++;
}
pushInstruction(mb, p);
}
+   GDKfree(old);
 
 /* Defense line against incorrect plans */
 if( count){
@@ -68,7 +85,7 @@ OPTstrimpsImplementation(Client cntxt, M
 }
 /* keep all actions taken as a post block comment */
usec = GDKusec()- usec;
-snprintf(buf,256,"%-20s actions=%2d time=" LLFMT " 
usec","volcano",count,usec);
+snprintf(buf,256,"%-20s actions=%2d time=" LLFMT " 
usec","strimps",count,usec);
 newComment(mb,buf);
if( count > 0)
addtoMalBlkHistory(mb);
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Use BUNFMT to output BUN values

2021-09-09 Thread Panagiotis Koutsourakis
Changeset: c5f5568d25b1 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/c5f5568d25b1
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Use BUNFMT to output BUN values


diffs (14 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -556,8 +556,8 @@ STRMPfilter(BAT *b, BAT *s, char *q)
r->trevsorted = BATcount(r) <= 1;
r->tnil = false;
r->tnonil = true;
-   TRC_DEBUG(ACCELERATOR, "strimp prefiltering of " LLFMT
- " items took " LLFMT " usec. Keeping " LLFMT
+   TRC_DEBUG(ACCELERATOR, "strimp prefiltering of " BUNFMT
+ " items took " LLFMT " usec. Keeping " BUNFMT
  " items (%.2f%%).\n", ncand, GDKusec()-t0, r->batCount,
  100*r->batCount/(double)ncand);
TRC_DEBUG(ACCELERATOR, "r->" ALGOBATFMT "\n", ALGOBATPAR(r) );
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Fix some whitespace inconsistency

2021-09-09 Thread Panagiotis Koutsourakis
Changeset: 459f85f095b5 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/459f85f095b5
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Fix some whitespace inconsistency


diffs (118 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -225,7 +225,7 @@ STRMPchoosePairs(PairHistogramElem *hist
if (max_counts[cmin_max] < hist[i].cnt) {
max_counts[cmin_max] = hist[i].cnt;
indices[cmin_max] = i;
-for(hidx = cmin_max; hidx > 0 && max_counts[hidx] > 
max_counts[hidx-1]; hidx--) {
+   for(hidx = cmin_max; hidx > 0 && max_counts[hidx] > 
max_counts[hidx-1]; hidx--) {
swp(max_counts, hidx, hidx-1, uint64_t);
swp(indices, hidx, hidx-1, size_t);
}
@@ -350,7 +350,7 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s)
if (b->tstrimps == NULL) {
MT_lock_set(>batIdxLock);
/* Make sure no other thread got here first */
-if (b->tstrimps == NULL &&
+   if (b->tstrimps == NULL &&
STRMPbuildHeader(b, s, hpairs)) { /* Find the header pairs, 
put the result in hpairs */
sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the 
descriptor and the pair sizes */
for (i = 0; i < STRIMP_HEADER_SIZE; i++) {
@@ -389,7 +389,7 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s)
}
MT_lock_unset(>batIdxLock);
}
-return b->tstrimps;
+   return b->tstrimps;
 }
 
 /* This macro takes a bat and checks if the strimp construction has been
@@ -480,20 +480,20 @@ BATcheckstrimps(BAT *b)
GDKclrerr();/* we're not currently interested in 
errors */
}
MT_lock_unset(>batIdxLock);
-}
+   }
/* The string imprint is initialized if the strimp pointer is
 * not null and the number of bitstrings is equal to the bat
 * count.
 */
assert(!b->tstrimps || (b->tstrimps->strimps.free - HSIZE(((uint64_t 
*)b->tstrimps->strimps.base)[0]))/sizeof(uint64_t) <= b->batCount);
ret = STRIMP_COMPLETE(b);
-if (ret) {
+   if (ret) {
TRC_DEBUG(ACCELERATOR,
  "BATcheckstrimps(" ALGOBATFMT "): already has 
strimps, waited " LLFMT " usec\n",
  ALGOBATPAR(b), GDKusec() - t);
}
 
-return ret;
+   return ret;
 }
 
 /* Filter a BAT b using a string q. Return the result as a candidate
@@ -544,23 +544,23 @@ STRMPfilter(BAT *b, BAT *s, char *q)
for (i = 0; i < ncand; i++) {
x = canditer_next();
if ((bitstring_array[x] & qbmask) == qbmask) {
-  if (BUNappend(r, , false) != GDK_SUCCEED) {
-BBPunfix(r->batCacheid);
-goto sfilter_fail;
-  }
-}
-}
+   if (BUNappend(r, , false) != GDK_SUCCEED) {
+   BBPunfix(r->batCacheid);
+   goto sfilter_fail;
+   }
+   }
+   }
 
-r->tkey = true;
-r->tsorted = true;
-r->trevsorted = BATcount(r) <= 1;
-r->tnil = false;
-r->tnonil = true;
-TRC_DEBUG(ACCELERATOR, "strimp prefiltering of " LLFMT
+   r->tkey = true;
+   r->tsorted = true;
+   r->trevsorted = BATcount(r) <= 1;
+   r->tnil = false;
+   r->tnonil = true;
+   TRC_DEBUG(ACCELERATOR, "strimp prefiltering of " LLFMT
  " items took " LLFMT " usec. Keeping " LLFMT
  " items (%.2f%%).\n", ncand, GDKusec()-t0, r->batCount,
  100*r->batCount/(double)ncand);
-TRC_DEBUG(ACCELERATOR, "r->" ALGOBATFMT "\n", ALGOBATPAR(r) );
+   TRC_DEBUG(ACCELERATOR, "r->" ALGOBATFMT "\n", ALGOBATPAR(r) );
return virtualize(r);
 
  sfilter_fail:
@@ -659,7 +659,7 @@ STRMPcreate(BAT *b, BAT *s)
}
 
/* Disable this before merging to default */
-if (VIEWtparent(b)) {
+   if (VIEWtparent(b)) {
pb = BBP_cache(VIEWtparent(b));
assert(pb);
} else {
@@ -669,7 +669,7 @@ STRMPcreate(BAT *b, BAT *s)
if (BATcheckstrimps(pb))
return GDK_SUCCEED;
 
-if ((h = STRMPcreateStrimpHeap(pb, s)) == NULL) {
+   if ((h = STRMPcreateStrimpHeap(pb, s)) == NULL) {
return GDK_FAIL;
}
dh = (uint64_t *)h->strimps_base + b->hseqbase;
@@ -871,7 +871,7 @@ STRMPndigrams(BAT *b, size_t *n)
*n = 0;
for (i = 0; i < b->batCount; i++) {
s = (char *)BUNtail(bi, i);
-// *n += STRMP_strlen(s) - 1;
+   // *n += STRMP_strlen(s) - 1;
  

MonetDB: string_imprints - Merge with default

2021-09-13 Thread Panagiotis Koutsourakis
Changeset: 5d4525349513 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/5d4525349513
Modified Files:
gdk/CMakeLists.txt
gdk/gdk.h
gdk/gdk_bbp.c
gdk/gdk_private.h
sql/backends/monet5/CMakeLists.txt
sql/backends/monet5/sql.c
sql/scripts/CMakeLists.txt
Branch: string_imprints
Log Message:

Merge with default


diffs (truncated from 510485 to 300 lines):

diff --git a/common/utils/matomic.h b/common/utils/matomic.h
--- a/common/utils/matomic.h
+++ b/common/utils/matomic.h
@@ -75,7 +75,7 @@ typedef volatile atomic_ulong ATOMIC_TYP
 
 #define ATOMIC_INIT(var, val)  atomic_init(var, (ATOMIC_BASE_TYPE) (val))
 #define ATOMIC_DESTROY(var)((void) 0)
-#define ATOMIC_GET(var)atomic_load(var)
+#define ATOMIC_GET(var)((ATOMIC_BASE_TYPE) 
atomic_load(var))
 #define ATOMIC_SET(var, val)   atomic_store(var, (ATOMIC_BASE_TYPE) (val))
 #define ATOMIC_XCG(var, val)   atomic_exchange(var, (ATOMIC_BASE_TYPE) (val))
 #define ATOMIC_CAS(var, exp, des)  atomic_compare_exchange_strong(var, 
exp, (ATOMIC_BASE_TYPE) (des))
@@ -134,7 +134,7 @@ typedef __declspec(align(8)) volatile AT
 #if SIZEOF_SIZE_T == 8
 
 #ifdef __INTEL_COMPILER
-#define ATOMIC_GET(var)_InterlockedExchangeAdd64(var, 
0)
+#define ATOMIC_GET(var)((ATOMIC_BASE_TYPE) 
_InterlockedExchangeAdd64(var, 0))
 #else
 #define ATOMIC_GET(var)(*(var))
 /* should we use _InterlockedExchangeAdd64(var, 0) instead? */
@@ -162,7 +162,7 @@ ATOMIC_CAS(ATOMIC_TYPE *var, ATOMIC_BASE
 #else
 
 #ifdef DECLSPEC_NOINITALL
-#define ATOMIC_GET(var)
_InlineInterlockedExchangeAdd64(var, 0)
+#define ATOMIC_GET(var)((ATOMIC_BASE_TYPE) 
_InlineInterlockedExchangeAdd64(var, 0))
 #define ATOMIC_SET(var, val)   _InlineInterlockedExchange64(var, 
(ATOMIC_BASE_TYPE) (val))
 #define ATOMIC_XCG(var, val)   _InlineInterlockedExchange64(var, 
(ATOMIC_BASE_TYPE) (val))
 #define ATOMIC_ADD(var, val)   _InlineInterlockedExchangeAdd64(var, 
(ATOMIC_BASE_TYPE) (val))
@@ -172,7 +172,7 @@ ATOMIC_CAS(ATOMIC_TYPE *var, ATOMIC_BASE
 #define ATOMIC_OR(var, val)_InlineInterlockedOr64(var, 
(ATOMIC_BASE_TYPE) (val))
 #define ATOMIC_AND(var, val)   _InlineInterlockedAnd64(var, (ATOMIC_BASE_TYPE) 
(val))
 #else
-#define ATOMIC_GET(var)_InterlockedExchangeAdd64(var, 
0)
+#define ATOMIC_GET(var)((ATOMIC_BASE_TYPE) 
_InterlockedExchangeAdd64(var, 0))
 #define ATOMIC_SET(var, val)   _InterlockedExchange64(var, (ATOMIC_BASE_TYPE) 
(val))
 #define ATOMIC_XCG(var, val)   _InterlockedExchange64(var, (ATOMIC_BASE_TYPE) 
(val))
 #define ATOMIC_ADD(var, val)   _InterlockedExchangeAdd64(var, 
(ATOMIC_BASE_TYPE) (val))
@@ -233,7 +233,7 @@ typedef volatile ATOMIC_BASE_TYPE ATOMIC
 #define ATOMIC_INIT(var, val)  (*(var) = (val))
 #define ATOMIC_DESTROY(var)((void) 0)
 
-#define ATOMIC_GET(var)__atomic_load_n(var, 
__ATOMIC_SEQ_CST)
+#define ATOMIC_GET(var)((ATOMIC_BASE_TYPE) 
__atomic_load_n(var, __ATOMIC_SEQ_CST))
 #define ATOMIC_SET(var, val)   __atomic_store_n(var, (ATOMIC_BASE_TYPE) (val), 
__ATOMIC_SEQ_CST)
 #define ATOMIC_XCG(var, val)   __atomic_exchange_n(var, (ATOMIC_BASE_TYPE) 
(val), __ATOMIC_SEQ_CST)
 #define ATOMIC_CAS(var, exp, des)  __atomic_compare_exchange_n(var, exp, 
(ATOMIC_BASE_TYPE) (des), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
diff --git a/gdk/CMakeLists.txt b/gdk/CMakeLists.txt
--- a/gdk/CMakeLists.txt
+++ b/gdk/CMakeLists.txt
@@ -18,6 +18,7 @@ set(gdk_public_headers
   $
   $
   $
+  $
   $
   $
   $
@@ -28,7 +29,8 @@ set(gdk_public_headers
   $
   $
   $
-  $)
+  $
+  $)
 
 add_library(bat SHARED)
 
diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -863,9 +863,6 @@ mskGetVal(BAT *b, BUN p)
  *  HEAPcopy (Heap *dst,*src);
  * @item int
  * @tab
- *  HEAPdelete (Heap *dst, str o, str ext);
- * @item int
- * @tab
  *  HEAPwarm (Heap *h);
  * @end multitable
  *
diff --git a/gdk/gdk_align.c b/gdk/gdk_align.c
--- a/gdk/gdk_align.c
+++ b/gdk/gdk_align.c
@@ -91,7 +91,7 @@ VIEWcreate(oid seq, BAT *b)
return BATdense(seq, b->tseqbase, b->batCount);
}
 
-   bn = BATcreatedesc(seq, b->ttype, false, TRANSIENT);
+   bn = BATcreatedesc(seq, b->ttype, false, TRANSIENT, 0);
if (bn == NULL)
return NULL;
assert(bn->theap == NULL);
@@ -193,8 +193,7 @@ BATmaterialize(BAT *b)
.parentid = b->batCacheid,
.dirty = true,
};
-   strconcat_len(tail->filename, sizeof(tail->filename),
- BBP_physical(b->batCacheid), ".tail", NULL);
+   settailname(tail, BBP_physical(b->batCacheid), TYPE_oid, 0);
if (HEAPalloc(tail, cnt, sizeof(oid), 0) != GDK_SUCCEED) {
GDKfree(tail);
return GDK_FAIL;

MonetDB: string_imprints - Remove assertion

2021-09-13 Thread Panagiotis Koutsourakis
Changeset: a257bb9341c6 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/a257bb9341c6
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Remove assertion


diffs (23 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -485,7 +485,7 @@ BATcheckstrimps(BAT *b)
 * not null and the number of bitstrings is equal to the bat
 * count.
 */
-   assert(!b->tstrimps || (b->tstrimps->strimps.free - HSIZE(((uint64_t 
*)b->tstrimps->strimps.base)[0]))/sizeof(uint64_t) <= b->batCount);
+   // assert(!b->tstrimps || (b->tstrimps->strimps.free - HSIZE(((uint64_t 
*)b->tstrimps->strimps.base)[0]))/sizeof(uint64_t) <= b->batCount);
ret = STRIMP_COMPLETE(b);
if (ret) {
TRC_DEBUG(ACCELERATOR,
@@ -692,8 +692,9 @@ STRMPcreate(BAT *b, BAT *s)
MT_lock_unset(>batIdxLock);
 
/* The thread that reaches this point last needs to write the strimp to 
disk. */
-   if (STRIMP_COMPLETE(pb))
+   if (STRIMP_COMPLETE(pb)) {
persistStrimp(pb);
+   }
 
TRC_DEBUG(ACCELERATOR, "strimp creation took " LLFMT " usec\n", 
GDKusec()-t0);
return GDK_SUCCEED;
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Change gdk_use_strimps to yes-no vari...

2021-09-06 Thread Panagiotis Koutsourakis
Changeset: 9ee8a080cd94 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/9ee8a080cd94
Modified Files:
monetdb5/modules/mal/pcre.c
Branch: string_imprints
Log Message:

Change gdk_use_strimps to yes-no variable


diffs (12 lines):

diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c
--- a/monetdb5/modules/mal/pcre.c
+++ b/monetdb5/modules/mal/pcre.c
@@ -1873,7 +1873,7 @@ PCRElikeselect(bat *ret, const bat *bid,
str msg = MAL_SUCCEED;
char *ppat = NULL;
bool use_re = false, use_strcmp = false, empty = false;
-   bool use_strimps = GDKgetenv_int("gdk_use_strimps", 0);
+   bool use_strimps = GDKgetenv_isyes("gdk_use_strimps");
 
if ((b = BATdescriptor(*bid)) == NULL) {
msg = createException(MAL, "algebra.likeselect", 
SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - When filtering log how many elements ...

2021-09-06 Thread Panagiotis Koutsourakis
Changeset: 23861925c0dc for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/23861925c0dc
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

When filtering log how many elements remain


diffs (41 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -532,21 +532,23 @@ STRMPfilter(BAT *b, BAT *s, char *q)
for (i = 0; i < ncand; i++) {
x = canditer_next();
if ((bitstring_array[x] & qbmask) == qbmask) {
-   if (BUNappend(r, , false) != GDK_SUCCEED) {
-   BBPunfix(r->batCacheid);
-   goto sfilter_fail;
-   }
-   }
-   }
+  if (BUNappend(r, , false) != GDK_SUCCEED) {
+BBPunfix(r->batCacheid);
+goto sfilter_fail;
+  }
+}
+}
 
-   r->tkey = true;
-   r->tsorted = true;
-   r->trevsorted = BATcount(r) <= 1;
-   r->tnil = false;
-   r->tnonil = true;
-   TRC_DEBUG(ACCELERATOR, "strimp prefiltering of " LLFMT
- " items took " LLFMT " usec\n", ncand, GDKusec()-t0);
-   TRC_DEBUG(ACCELERATOR, "r->" ALGOBATFMT "\n", ALGOBATPAR(r) );
+r->tkey = true;
+r->tsorted = true;
+r->trevsorted = BATcount(r) <= 1;
+r->tnil = false;
+r->tnonil = true;
+TRC_DEBUG(ACCELERATOR, "strimp prefiltering of " LLFMT
+ " items took " LLFMT " usec. Keeping " LLFMT
+ " items (%.2f%%).\n", ncand, GDKusec()-t0, r->batCount,
+ 100*r->batCount/(double)ncand);
+TRC_DEBUG(ACCELERATOR, "r->" ALGOBATFMT "\n", ALGOBATPAR(r) );
return virtualize(r);
 
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Define a full pipeline using the stri...

2021-09-06 Thread Panagiotis Koutsourakis
Changeset: ca39849883db for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/ca39849883db
Modified Files:
monetdb5/optimizer/opt_pipes.c
Branch: string_imprints
Log Message:

Define a full pipeline using the strimps optimizer

This is a copy of the default pipeline with the strimps optimizer
included, but it unfortunately crashes the server.


diffs (45 lines):

diff --git a/monetdb5/optimizer/opt_pipes.c b/monetdb5/optimizer/opt_pipes.c
--- a/monetdb5/optimizer/opt_pipes.c
+++ b/monetdb5/optimizer/opt_pipes.c
@@ -115,6 +115,41 @@ static struct PIPELINES {
 "optimizer.wlc();"
 "optimizer.garbageCollector();",
 "stable", NULL, 1},
+   {"strimps_pipe",
+"optimizer.inline();"
+"optimizer.remap();"
+"optimizer.costModel();"
+"optimizer.coercions();"
+"optimizer.aliases();"
+"optimizer.evaluate();"
+"optimizer.emptybind();"
+"optimizer.deadcode();" /* Feb2021 update, I pushed deadcode optimizer 
earlier in the pipeline so it runs before mitosis, thus removing less 
instructions */
+"optimizer.pushselect();"
+"optimizer.aliases();"
+"optimizer.mitosis();"
+"optimizer.mergetable();"
+"optimizer.bincopyfrom();"
+"optimizer.aliases();"
+"optimizer.constants();"
+"optimizer.commonTerms();"
+"optimizer.projectionpath();"
+"optimizer.deadcode();"
+"optimizer.matpack();"
+"optimizer.reorder();"
+"optimizer.dataflow();"
+"optimizer.querylog();"
+"optimizer.multiplex();"
+"optimizer.strimps();"
+"optimizer.generator();"
+"optimizer.profiler();"
+"optimizer.candidates();"
+//"optimizer.mask();"
+"optimizer.deadcode();"
+"optimizer.postfix();"
+//  "optimizer.jit();" awaiting the new batcalc api
+"optimizer.wlc();"
+"optimizer.garbageCollector();",
+"stable", NULL, 1},
{"default_fast",
 "optimizer.defaultfast()",
 "stable", NULL, 1},
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Merge with default

2021-09-06 Thread Panagiotis Koutsourakis
Changeset: f70db48d5bd9 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/f70db48d5bd9
Modified Files:
gdk/gdk.h
gdk/gdk_bbp.c
gdk/gdk_strimps.c
monetdb5/optimizer/opt_pipes.c
monetdb5/optimizer/opt_prelude.c
monetdb5/optimizer/opt_prelude.h
monetdb5/optimizer/opt_wrapper.c
monetdb5/optimizer/optimizer.c
sql/backends/monet5/sql.c
Branch: string_imprints
Log Message:

Merge with default


diffs (truncated from 8029 to 300 lines):

diff --git a/clients/Tests/MAL-signatures.stable.out 
b/clients/Tests/MAL-signatures.stable.out
--- a/clients/Tests/MAL-signatures.stable.out
+++ b/clients/Tests/MAL-signatures.stable.out
@@ -9110,6 +9110,8 @@
 [ "optimizer", "volcano_pipe", "function optimizer.volcano_pipe():void;",  
"", ""  ]
 [ "optimizer", "wlc",  "pattern optimizer.wlc():str ", "OPTwrapper;",  ""  
]
 [ "optimizer", "wlc",  "pattern optimizer.wlc(X_0:str, X_1:str):str ", 
"OPTwrapper;",  "Inject the workload capture-replay primitives" ]
+[ "optimizer", "wrapper",  "pattern optimizer.wrapper():str ", 
"OPTwrapper;",  ""  ]
+[ "optimizer", "wrapper",  "pattern optimizer.wrapper(X_0:str, 
X_1:str):str ", "OPTwrapper;",  "Fake optimizer"]
 [ "pcre",  "imatch",   "command pcre.imatch(X_0:str, X_1:str):bit ",   
"PCREimatch;",  "Caseless Perl Compatible Regular Expression pattern matching 
against a string" ]
 [ "pcre",  "index","command pcre.index(X_0:pcre, X_1:str):int ",   
"PCREindex;",   "match a pattern, return matched position (or 0 when not 
found)"]
 [ "pcre",  "match","command pcre.match(X_0:str, X_1:str):bit ",
"PCREmatch;",   "Perl Compatible Regular Expression pattern matching against a 
string"  ]
diff --git a/clients/Tests/MAL-signatures.stable.out.int128 
b/clients/Tests/MAL-signatures.stable.out.int128
--- a/clients/Tests/MAL-signatures.stable.out.int128
+++ b/clients/Tests/MAL-signatures.stable.out.int128
@@ -12410,6 +12410,8 @@
 [ "optimizer", "volcano_pipe", "function optimizer.volcano_pipe():void;",  
"", ""  ]
 [ "optimizer", "wlc",  "pattern optimizer.wlc():str ", "OPTwrapper;",  ""  
]
 [ "optimizer", "wlc",  "pattern optimizer.wlc(X_0:str, X_1:str):str ", 
"OPTwrapper;",  "Inject the workload capture-replay primitives" ]
+[ "optimizer", "wrapper",  "pattern optimizer.wrapper():str ", 
"OPTwrapper;",  ""  ]
+[ "optimizer", "wrapper",  "pattern optimizer.wrapper(X_0:str, 
X_1:str):str ", "OPTwrapper;",  "Fake optimizer"]
 [ "pcre",  "imatch",   "command pcre.imatch(X_0:str, X_1:str):bit ",   
"PCREimatch;",  "Caseless Perl Compatible Regular Expression pattern matching 
against a string" ]
 [ "pcre",  "index","command pcre.index(X_0:pcre, X_1:str):int ",   
"PCREindex;",   "match a pattern, return matched position (or 0 when not 
found)"]
 [ "pcre",  "match","command pcre.match(X_0:str, X_1:str):bit ",
"PCREmatch;",   "Perl Compatible Regular Expression pattern matching against a 
string"  ]
diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -773,7 +773,8 @@ void MPresetProfiler(stream *fdout);
 char *MSP_locate_sqlscript(const char *mod_name, bit recurse);
 str MSinitClientPrg(Client cntxt, const char *mod, const char *nme);
 void MSresetInstructions(MalBlkPtr mb, int start);
-void MSresetVariables(Client cntxt, MalBlkPtr mb, MalStkPtr glb, int start);
+void MSresetStack(Client cntxt, MalBlkPtr mb, MalStkPtr glb);
+void MSresetVariables(MalBlkPtr mb);
 void MSscheduleClient(str command, str challenge, bstream *fin, stream *fout, 
protocol_version protocol, size_t blocksize);
 str MSserveClient(Client cntxt);
 str OIDXcreateImplementation(Client cntxt, int tpe, BAT *b, int pieces);
@@ -1137,6 +1138,7 @@ const char *mdbRef;
 void mdbSetBreakRequest(Client cntxt, MalBlkPtr mb, str request, char cmd);
 const char *mergecandRef;
 const char *mergepackRef;
+const char *mergetableRef;
 const char *minRef;
 const char *min_no_nilRef;
 const char *minusRef;
@@ -1227,7 +1229,6 @@ const char *plusRef;
 const char *postludeRef;
 const char *preludeRef;
 MalStkPtr prepareMALstack(MalBlkPtr mb, int size);
-int prepareMalBlk(MalBlkPtr mb, str s);
 void printFunction(stream *fd, MalBlkPtr mb, MalStkPtr stk, int flg);
 void printInstruction(stream *fd, MalBlkPtr mb, MalStkPtr stk, InstrPtr p, int 
flg);
 const char *printRef;
@@ -1386,6 +1387,7 @@ const char *thetajoinRef;
 const char *thetaselectRef;
 const char *tidRef;
 const char *timestampRef;
+const char *totalRef;
 void traceFunction(component_t comp, MalBlkPtr mb, MalStkPtr stk, int flg);
 void traceInstruction(component_t comp, MalBlkPtr mb, MalStkPtr stk, InstrPtr 
p, int flg);
 const char *transactionRef;
diff --git a/common/stream/fwf.c b/common/stream/fwf.c
--- a/common/stream/fwf.c

MonetDB: string_imprints - Fix the strimp optimizer pipe

2021-09-06 Thread Panagiotis Koutsourakis
Changeset: 636a2b626cfa for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/636a2b626cfa
Modified Files:
monetdb5/optimizer/opt_pipes.c
monetdb5/optimizer/opt_strimps.c
Branch: string_imprints
Log Message:

Fix the strimp optimizer pipe


diffs (91 lines):

diff --git a/monetdb5/optimizer/opt_pipes.c b/monetdb5/optimizer/opt_pipes.c
--- a/monetdb5/optimizer/opt_pipes.c
+++ b/monetdb5/optimizer/opt_pipes.c
@@ -58,6 +58,8 @@ static struct PIPELINES {
 "optimizer.inline();"
 "optimizer.remap();"
 "optimizer.bincopyfrom();"
+"optimizer.aliases();"
+"optimizer.constants();"
 "optimizer.deadcode();"
 "optimizer.multiplex();"
 "optimizer.strimps();"
diff --git a/monetdb5/optimizer/opt_strimps.c b/monetdb5/optimizer/opt_strimps.c
--- a/monetdb5/optimizer/opt_strimps.c
+++ b/monetdb5/optimizer/opt_strimps.c
@@ -23,14 +23,13 @@
 str
 OPTstrimpsImplementation(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr 
pci)
 {
-   int i, limit, slimit, needed =0, actions=0;
+   int i, limit, slimit, actions=0;
+   bool needed = false;
// int mvcvar = -1;
InstrPtr p, q, *old = mb->stmt;
char buf[256];
lng usec = GDKusec();
str msg = MAL_SUCCEED;
-   /* int res, nvar; */
-   /* ValRecord cst; */
int res;
 
(void) pci;
@@ -44,8 +43,11 @@ OPTstrimpsImplementation(Client cntxt, M
 
for(i=0; i < limit; i++) {
p = old[i];
-   if (getModuleId(p) == algebraRef && getFunctionId(p) == 
likeselectRef)
-   needed = 1;
+   if (getModuleId(p) == algebraRef &&
+   getFunctionId(p) == likeselectRef) {
+   needed = true;
+   break;
+   }
}
 
if (!needed)
@@ -65,31 +67,31 @@ OPTstrimpsImplementation(Client cntxt, M
 
/* Look for bind operations on strings, because for those we 
migh need strimps */
 
-   if (getModuleId(p) == algebraRef && getFunctionId(p) == 
likeselectRef) {
-
-   /* cst.vtype = TYPE_bit; */
-   /* nvar = defConstant(mb, TYPE_bit, ); */
+   if (getModuleId(p) == algebraRef &&
+   getFunctionId(p) == likeselectRef) {
q = newInstruction(mb, strimpsRef, 
strimpFilterSelectRef);
res = newTmpVariable(mb, newBatType(TYPE_oid));
setDestVar(q, res);
-   q = addArgument(mb, q, getArg(p, 1));
-   q = addArgument(mb, q, getArg(p, 2));
-   q = addArgument(mb, q, getArg(p, 3));
-   q = addArgument(mb, q, getArg(p, 6));
+   q = pushArgument(mb, q, getArg(p, 1));
+   q = pushArgument(mb, q, getArg(p, 2));
+   q = pushArgument(mb, q, getArg(p, 3));
+   q = pushArgument(mb, q, getArg(p, 6));
 
pushInstruction(mb, q);
-   typeChecker(cntxt->usermodule, mb, q, mb->stop-1, TRUE);
+   typeChecker(cntxt->usermodule, mb, q, mb->stop - 1, 
TRUE);
 
-   p = setArgument(mb, p, 2, getArg(q, 0));
+   getArg(p, 2) = res;
+   // setArgument(mb, p, 2, res);
 
actions++;
+   /* continue; */
}
pushInstruction(mb, p);
}
(void)slimit;
-   /* for (; i < slimit; i++) */
-   /*  if (old[i]) */
-   /*  freeInstruction(old[i]); */
+   for (; i < slimit; i++)
+   if (old[i])
+   freeInstruction(old[i]);
GDKfree(old);
 
 /* Defense line against incorrect plans */
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Update comment

2021-09-06 Thread Panagiotis Koutsourakis
Changeset: 63aecf69eb6a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/63aecf69eb6a
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Update comment


diffs (173 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -12,11 +12,10 @@
  * A string imprint is an index that can be used as a prefilter in LIKE
  * queries. It has 2 components:
  *
- * - a header of 32 or 64 string element pairs.
+ * - a header of 64 string element pairs.
  *
- * - a 32 or 64 bit mask for each string in the BAT that encodes the
- *   presence or absence of each element of the header in the specific
- *   item.
+ * - a 64 bit mask for each string in the BAT that encodes the presence
+ *   or absence of each element of the header in the specific item.
  *
  * A string imprint is stored in a new Heap in the BAT, aligned in 8
  * byte (64 bit) words.
@@ -24,40 +23,45 @@
  * The first 64 bit word, the header descriptor, describes how the
  * header of the strimp is encoded. The least significant byte (v in the
  * schematic below) is the version number. The second (np) is the number
- * of pairs in the header. The next 2 bytes (hs) is the size of the
- * header in bytes. Finally the fifth byte is the persistence byte. The
- * last 3 bytes needed to align to the 8 byte boundary should be zero,
- * and are reserved for future use.
+ * of pairs in the header. In the current implementation this is always
+ * 64. The next 2 bytes (hs) is the total size of the header in
+ * bytes. Finally the fifth byte is the persistence byte. The last 3
+ * bytes needed to align to the 8 byte boundary should be zero, and are
+ * reserved for future use.
  *
  * The following np bytes are the sizes of the pairs. These can have
  * values from 2 to 8 and are the number of bytes that the corresponding
  * pair takes up. Following that there are the bytes encoding the actual
  * pairs.
  *
- * |   v   |  np   |  hs  |   p   |  reserved  |  8bytes
- * |   | 
---
- * Strimp Header  |
- * | psz_0 | psz_1 | ...   |  |
- * |   |  --- |
- * |   |np bytes  |
- * |   ... | psz_n |  ---  hs 
bytes
- * | pair_0  |   pair_1|  |
- * |...|  |
- * | pair_k-1   |   pair_k |  |
- * |  pair_n   |  |
- * |   | 
---
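+ * | 1byte | 1byte | 1byte | 1byte | 1byte | 1byte | 1byte | 1byte |
+ * |---------------------------------------------------------------|
+ * |   v   |  np   |      hs       |   p   |       reserved        |  8bytes     ---
+ * |---------------------------------------------------------------|  ___         |
+ * | psz_0 | psz_1 | ...                                           |   |          |
+ * |                                                               |   |          |
+ * |                                                               |np bytes      |
+ * |                                                               |   |          |
+ * |                                                   ... | psz_n |   |       hs bytes
+ * |---------------------------------------------------------------|  ___         |
+ * |            pair_0             |            pair_1             |              |
+ * |                              ...                              |              |
+ * |           pair_k-1            |            pair_k             |              |
+ * |                            pair_n                             |              |
+ * |---------------------------------------------------------------|             ---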
  *
  *
- * The bitmasks for each string in the BAT follow after this.
+ * The bitmasks for each string in the BAT follow after this, aligned to
+ * the string BAT.
  *
  * Strimp creation goes as follows:
  *
  * - Construct a histogram of the element (byte or character) pairs for
  *   all the strings in the BAT.
  *
- * - Take the 32/64 most frequent pairs as the Strimp Header.
+ * - Take the 64 most frequent pairs as the Strimp Header.
  *
- * - For each string in the bat construct a 32/64 bit mask that encodes
+ * - For each string in the bat construct a 64 bit mask that encodes
  *   the presence or absence of each member of the header in the string.
  */
 
@@ -80,8 +84,8 @@
 #define NPAIRS(d) (((d) >> 8) & 0xff)
 #define HSIZE(d) (((d) >> 16) & 0xffff)
 
-#undef UTF8STRINGS 

MonetDB: string_imprints - Merge with default

2021-08-05 Thread Panagiotis Koutsourakis
ct most users not to notice this change, as
+  such schema changes aren't usually done concurrently.
+
+* Tue Jul 20 2021 Sjoerd Mullender  - 11.41.1-20210723
+- clients: The MonetDB stethoscope has been removed.  There is now a separate
+  package available with PIP (monetdb_stethoscope) or as an RPM or DEB
+  package (stethoscope) from the monetdb.org repository.
+
+* Tue Jul 20 2021 Sjoerd Mullender  - 11.41.1-20210723
+- gdk: A new type, called msk, was introduced.  This is a bit mask type.
+  In a bat with type msk, each row occupies a single bit, so 8 rows are
+  stored in a single byte.  There is no NULL value for this type.
+- gdk: The function of the BAT iterator (type BATiter, function bat_iterator)
+  has been expanded.  The iterator now contains more information about
+  the BAT, and it contains a pointer to the heaps (theap and tvheap)
+  that are stable, at least in the sense that they will remain available
+  even when parallel threads update the BAT and cause those heaps to grow
+  (and therefore possibly move in memory).  A call to bat_iterator must
+  now be accompanied by a call to bat_iterator_end.
+
+* Mon Jun  7 2021 Sjoerd Mullender  - 11.41.1-20210723
+- monetdb5: When using the --in-memory option, mserver5 will run completely in
+  memory, i.e. not create a database on disk.  The server can still be
+  connected to using the name of the in-memory database.  This name is
+  "in-memory".
+
+* Tue May 11 2021 Sjoerd Mullender  - 11.41.1-20210723
+- sql: There is now a function sys.current_sessionid() to return the session
+  ID of the current session.  This ID corresponds with the sessionid in
+  the sys.queue() result.
+
+* Mon May 10 2021 Panagiotis Koutsourakis  - 
11.41.1-20210723
+- merovingian: Deprecate `profilerstart` and `profilerstop` commands. Since
+  stethoscope is a separate project 
(https://github.com/MonetDBSolutions/monetdb-pystethoscope)
+  the installation directory is not standard anymore. `profilerstart` and
+  `profilerstop` commands assume that the stethoscope executable is in the
+  same directory as `mserver5`. This is no longer necessarily true since
+  stethoscope can now be installed in a python virtual environment. The
+  commands still work if stethoscope is installed using the official
+  MonetDB installers, or if a symbolic link is created in the directory
+  where `mserver5` is located.
+
+* Fri May  7 2021 Sjoerd Mullender  - 11.41.1-20210723
+- odbc: A typo that made the SQLSpecialColumns function unusable was fixed.
+
+* Mon May  3 2021 Pedro Ferreira  - 
11.41.1-20210723
+- sql: Merge statements could not produce correct results on complex join
+  conditions, so a renovation was made. As a consequence, subqueries
+  now have to be disabled on merge join conditions.
+
+* Mon May  3 2021 svetlin  - 
11.41.1-20210723
+- sql: preserve in-query comments
+
+* Mon May  3 2021 Sjoerd Mullender  - 11.41.1-20210723
+- merovingian: The exittimeout value can now be set to a negative value (e.g. 
-1) to
+  indicate that when stopping the dbfarm (using monetdbd stop dbfarm),
+  any mserver5 processes are to be sent a termination signal and then
+  waited for until they terminate.  In addition, if exittimeout is greater
+  than zero, the mserver5 processes are sent a SIGKILL signal after the
+  specified timeout and the managing monetdbd is sent a SIGKILL signal
+  after another five seconds (if it didn't terminate already).  The old
+  situation was that the managing monetdbd process was sent a SIGKILL
+  after 30 seconds, and the mserver5 processes that hadn't terminated
+  yet would be allowed to continue their termination sequence.
+
+* Mon May  3 2021 Sjoerd Mullender  - 11.41.1-20210723
+- gdk: Implemented function BUNreplacemultiincr to replace multiple values
+  in a BAT in one go, starting at a given position.
+- gdk: Implemented new function BUNreplacemulti to replace multiple values
+  in a BAT in one go, at the given positions.
+- gdk: Removed function BUNinplace, just use BUNreplace, and check whether
+  the BAT argument is of type TYPE_void before calling if you don't
+  want to materialize.
+
+* Mon May  3 2021 Pedro Ferreira  - 
11.41.1-20210723
+- sql: Use of CTEs inside UPDATE and DELETE statements are now more
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Make sure the strimp is created before filtering

2021-08-05 Thread Panagiotis Koutsourakis
Changeset: 87cd6fa29635 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/87cd6fa29635
Modified Files:
monetdb5/modules/mal/strimps.c
Branch: string_imprints
Log Message:

Make sure the strimp is created before filtering


diffs (12 lines):

diff --git a/monetdb5/modules/mal/strimps.c b/monetdb5/modules/mal/strimps.c
--- a/monetdb5/modules/mal/strimps.c
+++ b/monetdb5/modules/mal/strimps.c
@@ -124,6 +124,8 @@ PATstrimpFilterSelect(Client cntxt, MalB
 
assert(s->ttype == TYPE_void);
 
+   if(STRMPcreate(b) != GDK_SUCCEED)
+   throw(MAL, "strimps.strimpfilter", SQLSTATE(HY002) "strimp 
creation failed");
 
pat = *getArgReference_str(stk, pci, 3);
if ((ob = STRMPfilter(b, pat)) == NULL) {
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Only one thread should compute the header and allocate the heap

2021-08-05 Thread Panagiotis Koutsourakis
Changeset: 8671d66745fb for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/8671d66745fb
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Only one thread should compute the header and allocate the heap


diffs (261 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -216,7 +216,7 @@ STRMPchoosePairs(PairHistogramElem *hist
const size_t cmin_max = STRIMP_HEADER_SIZE - 1;
size_t hidx;
 
-   TRC_DEBUG_IF(ALGO) t0 = GDKusec();
+   TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
 
for(i = 0; i < hist_size; i++) {
if (max_counts[cmin_max] < hist[i].cnt) {
@@ -234,7 +234,7 @@ STRMPchoosePairs(PairHistogramElem *hist
cp[i].psize = hist[indices[i]].p->psize;
}
 
-   TRC_DEBUG(ALGO, LLFMT " usec\n", GDKusec() - t0);
+   TRC_DEBUG(ACCELERATOR, LLFMT " usec\n", GDKusec() - t0);
 }
 
 static bool
@@ -249,7 +249,7 @@ STRMPbuildHeader(BAT *b, CharPair *hpair
PairIterator pi, *pip;
CharPair cp, *cpp;
 
-   TRC_DEBUG_IF(ALGO) t0 = GDKusec();
+   TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
hlen = STRIMP_HISTSIZE;
if ((hist = (PairHistogramElem 
*)GDKmalloc(hlen*sizeof(PairHistogramElem))) == NULL) {
// TODO handle error
@@ -317,7 +317,7 @@ STRMPbuildHeader(BAT *b, CharPair *hpair
}
GDKfree(hist);
 
-   TRC_DEBUG(ALGO, LLFMT " usec\n", GDKusec() - t0);
+   TRC_DEBUG(ACCELERATOR, LLFMT " usec\n", GDKusec() - t0);
return true;
 }
 
@@ -333,39 +333,48 @@ STRMPcreateStrimpHeap(BAT *b)
CharPair hpairs[STRIMP_HEADER_SIZE];
const char *nme;
 
-
-   STRMPbuildHeader(b, hpairs);  /* Find the header pairs */
-   sz = 8 + STRIMP_HEADER_SIZE;  /* add 8-bytes for the descriptor */
-   for(i = 0; i < STRIMP_HEADER_SIZE; i++) {
-   sz += hpairs[i].psize;
-   }
+   if (b->tstrimps == NULL) {
+   MT_lock_set(&b->batIdxLock);
+   /* Make sure no other thread got here first */
+if (b->tstrimps == NULL) {
+   STRMPbuildHeader(b, hpairs); /* Find the header pairs */
+   sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the 
descriptor */
+   for (i = 0; i < STRIMP_HEADER_SIZE; i++) {
+   sz += hpairs[i].psize;
+   }
 
-   nme = GDKinmemory(b->theap->farmid) ? ":memory:" : 
BBP_physical(b->batCacheid);
-   /* Allocate the strimps heap */
-   if ((r = GDKzalloc(sizeof(Strimps))) == NULL ||
-   (r->strimps.farmid = BBPselectfarm(b->batRole, b->ttype, 
strimpheap)) < 0 ||
-   strconcat_len(r->strimps.filename, sizeof(r->strimps.filename),
- nme, ".tstrimps", NULL) >= 
sizeof(r->strimps.filename) ||
-   HEAPalloc(&r->strimps, BATcount(b)*sizeof(uint64_t) + sz, sizeof(uint8_t), 0) != GDK_SUCCEED) {
-   GDKfree(r);
-   return NULL;
+   nme = GDKinmemory(b->theap->farmid) ? ":memory:" : 
BBP_physical(b->batCacheid);
+   /* Allocate the strimps heap */
+   if ((r = GDKzalloc(sizeof(Strimps))) == NULL ||
+   (r->strimps.farmid = BBPselectfarm(b->batRole, 
b->ttype, strimpheap)) < 0 ||
+   strconcat_len(r->strimps.filename, 
sizeof(r->strimps.filename), nme,
+ ".tstrimps", NULL) >= 
sizeof(r->strimps.filename) ||
+   HEAPalloc(&r->strimps, BATcount(b) * sizeof(uint64_t) + sz, sizeof(uint8_t), 0) != GDK_SUCCEED) {
+   GDKfree(r);
+   MT_lock_unset(&b->batIdxLock);
+   return NULL;
+   }
+
+   descriptor = STRIMP_VERSION | 
((uint64_t)STRIMP_HEADER_SIZE) << 8 | ((uint64_t)sz) << 16;
+
+   ((uint64_t *)r->strimps.base)[0] = descriptor;
+   r->sizes_base = h1 = (uint8_t *)r->strimps.base + 8;
+   r->pairs_base = h2 = (uint8_t *)h1 + STRIMP_HEADER_SIZE;
+
+   for (i = 0; i < STRIMP_HEADER_SIZE; i++) {
+   *(h1 + i) = hpairs[i].psize;
+   memcpy(h2, hpairs[i].pbytes, hpairs[i].psize);
+   h2 += hpairs[i].psize;
+   }
+   r->strimps_base = h2;
+   r->strimps.free = sz;
+
+   b->tstrimps = r;
+   b->batDirtydesc = true;
+   }
+   MT_lock_unset(&b->batIdxLock);
}
-
-   descriptor =  STRIMP_VERSION | ((uint64_t)STRIMP_HEADER_SIZE) << 8 | 
((uint64_t)sz) << 16;
-
-   ((uint64_t *)r->strimps.base)[0] = descriptor;
-   r->sizes_base = h1 = (uint8_t *)r->strimps.base 

MonetDB: string_imprints - Merge with default

2021-09-20 Thread Panagiotis Koutsourakis
Changeset: 39e30b8b6392 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/39e30b8b6392
Modified Files:
gdk/gdk_bbp.c
gdk/gdk_private.h
monetdb5/modules/mal/batcalc.c
monetdb5/modules/mal/pcre.c
sql/backends/monet5/sql.c
Branch: string_imprints
Log Message:

Merge with default


diffs (truncated from 9006 to 300 lines):

diff --git a/clients/Tests/MAL-signatures.stable.out 
b/clients/Tests/MAL-signatures.stable.out
--- a/clients/Tests/MAL-signatures.stable.out
+++ b/clients/Tests/MAL-signatures.stable.out
@@ -718,29 +718,29 @@
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:any_1], X_1:bat[:any_1], 
X_2:bat[:oid], X_3:bat[:oid], X_4:bit):bat[:bit] ",   "CMDbatNE;",""
  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:any_1], X_1:bat[:any_1], 
X_2:bit):bat[:bit] ", "CMDbatNE;",""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], 
X_1:bat[:int]):bat[:bit] ",  "CMDbatNE;",""  ]
-[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], 
X_1:bat[:int]):bat[:bit] ",  "CMDbatNE;",""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:int], 
X_2:bat[:oid], X_3:bat[:oid]):bat[:bit] ","CMDbatNE;",""  ]
-[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:int], 
X_2:bat[:oid], X_3:bat[:oid]):bat[:bit] ","CMDbatNE;",""  ]
-[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], 
X_1:bat[:lng]):bat[:bit] ",  "CMDbatNE;",""  ]
+[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:int], 
X_2:bat[:oid], X_3:bat[:oid], X_4:bit):bat[:bit] ",   "CMDbatNE;",""  ]
+[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:int], 
X_2:bit):bat[:bit] ", "CMDbatNE;",""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], 
X_1:bat[:lng]):bat[:bit] ",  "CMDbatNE;",""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:lng], 
X_2:bat[:oid], X_3:bat[:oid]):bat[:bit] ","CMDbatNE;",""  ]
-[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:lng], 
X_2:bat[:oid], X_3:bat[:oid]):bat[:bit] ","CMDbatNE;",""  ]
-[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], 
X_1:bat[:sht]):bat[:bit] ",  "CMDbatNE;",""  ]
+[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:lng], 
X_2:bat[:oid], X_3:bat[:oid], X_4:bit):bat[:bit] ",   "CMDbatNE;",""  ]
+[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:lng], 
X_2:bit):bat[:bit] ", "CMDbatNE;",""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], 
X_1:bat[:sht]):bat[:bit] ",  "CMDbatNE;",""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:sht], 
X_2:bat[:oid], X_3:bat[:oid]):bat[:bit] ","CMDbatNE;",""  ]
-[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:sht], 
X_2:bat[:oid], X_3:bat[:oid]):bat[:bit] ","CMDbatNE;",""  ]
-[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:int):bat[:bit] 
","CMDbatNE;",""  ]
+[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:sht], 
X_2:bat[:oid], X_3:bat[:oid], X_4:bit):bat[:bit] ",   "CMDbatNE;",""  ]
+[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:sht], 
X_2:bit):bat[:bit] ", "CMDbatNE;",""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:int):bat[:bit] 
","CMDbatNE;",""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:int, 
X_2:bat[:oid]):bat[:bit] ", "CMDbatNE;",""  ]
-[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:int, 
X_2:bat[:oid]):bat[:bit] ", "CMDbatNE;",""  ]
-[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:lng):bat[:bit] 
","CMDbatNE;",""  ]
+[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:int, 
X_2:bat[:oid], X_3:bit):bat[:bit] ","CMDbatNE;",""  ]
+[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:int, 
X_2:bit):bat[:bit] ",   "CMDbatNE;",""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:lng):bat[:bit] 
","CMDbatNE;",""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:lng, 
X_2:bat[:oid]):bat[:bit] ", "CMDbatNE;",""  ]
-[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:lng, 
X_2:bat[:oid]):bat[:bit] ", "CMDbatNE;",""  ]
-[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:sht):bat[:bit] 
","CMDbatNE;",""  ]
+[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:lng, 
X_2:bat[:oid], X_3:bit):bat[:bit] ","CMDbatNE;",""  ]
+[ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:lng, 
X_2:bit):bat[:bit] ",   "CMDbatNE;",""  ]
 [ "batcalc",   "!=",   "pattern batcalc.!=(X_0:bat[:bte], X_1:sht):bat[:bit] 
","CMDbatNE;",""  ]
 [ "batcalc", 

MonetDB: string_imprints - Avoid dividing in favor of multiplying

2021-09-20 Thread Panagiotis Koutsourakis
Changeset: ecf3c1a4555f for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/ecf3c1a4555f
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Avoid dividing in favor of multiplying


diffs (20 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -400,7 +400,7 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s)
 #define STRIMP_COMPLETE(b) \
b->tstrimps != NULL &&  \
(b->tstrimps == (Strimps *)1 || \
-(b->tstrimps->strimps.free - ((char 
*)b->tstrimps->bitstrings_base - b->tstrimps->strimps.base))/sizeof(uint64_t) 
== b->batCount)
+(b->tstrimps->strimps.free - ((char 
*)b->tstrimps->bitstrings_base - b->tstrimps->strimps.base)) == 
b->batCount*sizeof(uint64_t))
 
 static bool
 BATcheckstrimps(BAT *b)
@@ -485,7 +485,6 @@ BATcheckstrimps(BAT *b)
 * not null and the number of bitstrings is equal to the bat
 * count.
 */
-   // assert(!b->tstrimps || (b->tstrimps->strimps.free - HSIZE(((uint64_t 
*)b->tstrimps->strimps.base)[0]))/sizeof(uint64_t) <= b->batCount);
ret = STRIMP_COMPLETE(b);
if (ret) {
TRC_DEBUG(ACCELERATOR,
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Name pointer more appropriately

2021-09-20 Thread Panagiotis Koutsourakis
Changeset: d34debd8ca7e for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/d34debd8ca7e
Modified Files:
gdk/gdk_private.h
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Name pointer more appropriately


diffs (62 lines):

diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -418,8 +418,8 @@ struct Strimps {
Heap strimps;
uint8_t *sizes_base;/* pointer into strimps heap (pair sizes)  */
uint8_t *pairs_base;/* pointer into strimps heap (pairs start)   */
-   void *strimps_base; /* pointer into strimps heap (strimps start) */
-   /* strimps_base is a pointer to either a uint32_t or a uint64_t */
+   void *bitstrings_base;  /* pointer into strimps heap (bitstrings start) 
*/
+   /* bitstrings_base is a pointer to uint64_t */
 };
 
 typedef struct {
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -381,7 +381,7 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s)
memcpy(h2, hpairs[i].pbytes, psize);
h2 += psize;
}
-   r->strimps_base = h2;
+   r->bitstrings_base = h2;
r->strimps.free = sz;
 
b->tstrimps = r;
@@ -400,7 +400,7 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s)
 #define STRIMP_COMPLETE(b) \
b->tstrimps != NULL &&  \
(b->tstrimps == (Strimps *)1 || \
-(b->tstrimps->strimps.free - ((char 
*)b->tstrimps->strimps_base - b->tstrimps->strimps.base))/sizeof(uint64_t) == 
b->batCount)
+(b->tstrimps->strimps.free - ((char 
*)b->tstrimps->bitstrings_base - b->tstrimps->strimps.base))/sizeof(uint64_t) 
== b->batCount)
 
 static bool
 BATcheckstrimps(BAT *b)
@@ -461,7 +461,7 @@ BATcheckstrimps(BAT *b)
&& HEAPload(>strimps, nme, 
"tstrimps", false) == GDK_SUCCEED) {
hp->sizes_base = (uint8_t 
*)hp->strimps.base + 8; /* sizes just after the descriptor */
hp->pairs_base = hp->sizes_base 
+ npairs; /* pairs just after the offsets */
-   hp->strimps_base = 
hp->strimps.base + hsize;/* bitmasks just after the pairs */
+   hp->bitstrings_base = 
hp->strimps.base + hsize;/* bitmasks just after the pairs */
 
close(fd);
hp->strimps.parentid = 
b->batCacheid;
@@ -539,7 +539,7 @@ STRMPfilter(BAT *b, BAT *s, char *q)
 * (see the macro isIgnored).
 */
qbmask = STRMPmakebitstring(q, strmps);
-   bitstring_array = (uint64_t *)strmps->strimps_base;
+   bitstring_array = (uint64_t *)strmps->bitstrings_base;
 
for (i = 0; i < ncand; i++) {
x = canditer_next();
@@ -672,7 +672,7 @@ STRMPcreate(BAT *b, BAT *s)
if ((h = STRMPcreateStrimpHeap(pb, s)) == NULL) {
return GDK_FAIL;
}
-   dh = (uint64_t *)h->strimps_base + b->hseqbase;
+   dh = (uint64_t *)h->bitstrings_base + b->hseqbase;
 
ncand = canditer_init(, b, s);
 
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Merge heads

2021-09-22 Thread Panagiotis Koutsourakis
Changeset: b38e5d23af12 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/b38e5d23af12
Modified Files:
sql/test/emptydb/Tests/check.stable.out
sql/test/emptydb/Tests/check.stable.out.32bit
sql/test/emptydb/Tests/check.stable.out.int128
Branch: string_imprints
Log Message:

Merge heads


diffs (truncated from 358 to 300 lines):

diff --git a/clients/Tests/MAL-signatures.stable.out 
b/clients/Tests/MAL-signatures.stable.out
--- a/clients/Tests/MAL-signatures.stable.out
+++ b/clients/Tests/MAL-signatures.stable.out
@@ -9123,6 +9123,7 @@
 [ "optimizer", "mergetable",   "pattern optimizer.mergetable(X_0:str, 
X_1:str):str ",  "OPTwrapper;",  "Resolve the multi-table definitions"   ]
 [ "optimizer", "minimal_fast", "function optimizer.minimal_fast():void;",  
"", ""  ]
 [ "optimizer", "minimal_pipe", "function optimizer.minimal_pipe():void;",  
"", ""  ]
+[ "optimizer", "minimal_strimps_pipe", "function 
optimizer.minimal_strimps_pipe():void;",  "", ""  ]
 [ "optimizer", "minimalfast",  "pattern optimizer.minimalfast():str ", 
"OPTwrapper;",  ""  ]
 [ "optimizer", "minimalfast",  "pattern optimizer.minimalfast(X_0:str, 
X_1:str):str ", "OPTwrapper;",  "Fast compound minimal optimizer pipe"  ]
 [ "optimizer", "mitosis",  "pattern optimizer.mitosis():str ", 
"OPTwrapper;",  ""  ]
@@ -9156,6 +9157,9 @@
 [ "optimizer", "reorder",  "pattern optimizer.reorder():str ", 
"OPTwrapper;",  ""  ]
 [ "optimizer", "reorder",  "pattern optimizer.reorder(X_0:str, 
X_1:str):str ", "OPTwrapper;",  "Reorder by dataflow dependencies"  ]
 [ "optimizer", "sequential_pipe",  "function 
optimizer.sequential_pipe():void;",   "", ""  ]
+[ "optimizer", "strimps",  "pattern optimizer.strimps():str ", 
"OPTwrapper;",  ""  ]
+[ "optimizer", "strimps",  "pattern optimizer.strimps(X_0:str, 
X_1:str):str ", "OPTwrapper;",  "Use strimps index if appropriate"  ]
+[ "optimizer", "strimps_pipe", "function optimizer.strimps_pipe():void;",  
"", ""  ]
 [ "optimizer", "volcano",  "pattern optimizer.volcano():str ", 
"OPTwrapper;",  ""  ]
 [ "optimizer", "volcano",  "pattern optimizer.volcano(X_0:str, 
X_1:str):str ", "OPTwrapper;",  "Simulate volcano style execution"  ]
 [ "optimizer", "volcano_pipe", "function optimizer.volcano_pipe():void;",  
"", ""  ]
@@ -9305,6 +9309,7 @@
 [ "sql",   "covariancep",  "pattern sql.covariancep(X_0:lng, X_1:lng, 
X_2:bit, X_3:bit, X_4:int, X_5:oid, X_6:oid):dbl ",  "SQLcovar_pop;",
"return the covariance population value of groups"  ]
 [ "sql",   "covariancep",  "pattern sql.covariancep(X_0:sht, X_1:sht, 
X_2:bit, X_3:bit, X_4:int, X_5:oid, X_6:oid):dbl ",  "SQLcovar_pop;",
"return the covariance population value of groups"  ]
 [ "sql",   "createorderindex", "unsafe pattern 
sql.createorderindex(X_0:str, X_1:str, X_2:str):void ", 
"sql_createorderindex;","Instantiate the order index on a column"   
]
+[ "sql",   "createstrimps","unsafe pattern 
sql.createstrimps(X_0:str, X_1:str, X_2:str):void ","sql_createstrimps;",   
"Instantiate the strimps index on a column" ]
 [ "sql",   "cume_dist","pattern sql.cume_dist(X_0:any_1, X_1:bit, 
X_2:bit):dbl ",  "SQLcume_dist;","return the accumulated 
distribution of the number of rows per group to the total number of partition 
rows" ]
 [ "sql",   "current_time", "pattern sql.current_time():daytime ",  
"SQLcurrent_daytime;",  "Get the clients current daytime"   ]
 [ "sql",   "current_timestamp","pattern 
sql.current_timestamp():timestamp ",   "SQLcurrent_timestamp;","Get the 
clients current timestamp" ]
@@ -9592,6 +9597,10 @@
 [ "streams",   "readStr",  "unsafe command 
streams.readStr(X_0:streams):str ", "mnstr_read_stringwrap;",   "read 
string data from the stream"  ]
 [ "streams",   "writeInt", "unsafe command streams.writeInt(X_0:streams, 
X_1:int):void ",  "mnstr_writeIntwrap;",  "write data on the stream"  ]
 [ "streams",   "writeStr", "unsafe command streams.writeStr(X_0:streams, 
X_1:str):void ",  "mnstr_write_stringwrap;",  "write data on the stream"
  ]
+[ "strimps",   "mkstrimp", "pattern strimps.mkstrimp(X_0:bat[:str], 
X_1:bat[:oid]):void ", "PATstrimpCreate;", "construct the strimp a BAT"]
+[ "strimps",   "strimpfilter", "pattern strimps.strimpfilter(X_0:str, 
X_1:str):bit ",  "PATstrimpFilter;", ""  ]
+[ "strimps",   "strimpfilterjoin", "pattern 
strimps.strimpfilterjoin(X_0:str, X_1:any, X_2:any, X_3:any, X_4:any, X_5:any) 
(X_6:bat[:oid], X_7:bat[:str]) ",   "PATstrimpFilter;", ""  ]
+[ "strimps",   "strimpfilterselect",   "pattern 
strimps.strimpfilterselect(X_0:bat[:str], X_1:bat[:oid], X_2:str, 
X_3:bit):bat[:oid] ","PATstrimpFilterSelect;",   ""  ]
 [ 

MonetDB: string_imprints - Merge with default

2021-09-22 Thread Panagiotis Koutsourakis
Changeset: 3fc32e309cdc for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/3fc32e309cdc
Modified Files:
gdk/gdk_bbp.c
Branch: string_imprints
Log Message:

Merge with default


diffs (truncated from 2202 to 300 lines):

diff --git a/CMakeLists.txt b/CMakeLists.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,8 +53,7 @@ test_big_endian(IS_BIG_ENDIAN)
 include(monetdb-functions)
 include(monetdb-findpackages)
 include(monetdb-toolchain)
-monetdb_default_toolchain()
-#monetdb_default_compiler_options()
+monetdb_default_compiler_options()
 
 include(monetdb-defines)
 monetdb_hg_revision()
diff --git a/clients/mapiclient/dump.c b/clients/mapiclient/dump.c
--- a/clients/mapiclient/dump.c
+++ b/clients/mapiclient/dump.c
@@ -382,15 +382,16 @@ dump_foreign_keys(Mapi mid, const char *
if (tname != NULL) {
char *s = sescape(schema);
char *t = sescape(tname);
+   if (s == NULL || t == NULL) {
+   free(s);
+   free(t);
+   goto bailout;
+   }
maxquerylen = 1024 + strlen(t) + strlen(s);
query = malloc(maxquerylen);
-   if (s == NULL || t == NULL || query == NULL) {
-   if (s)
-   free(s);
-   if (t)
-   free(t);
-   if (query)
-   free(query);
+   if (query == NULL) {
+   free(s);
+   free(t);
goto bailout;
}
snprintf(query, maxquerylen,
diff --git a/clients/odbc/driver/ODBCUtil.c b/clients/odbc/driver/ODBCUtil.c
--- a/clients/odbc/driver/ODBCUtil.c
+++ b/clients/odbc/driver/ODBCUtil.c
@@ -1255,6 +1255,7 @@ ODBCTranslateSQL(ODBCDbc *dbc, const SQL
strncpy(q, nquery, pr);
for (r = func->repl; *r; r++) {
if (*r == '\1' || *r == 
'\2' || *r == '\3' || *r == '\4') {
+   assert(*r <= 
func->nargs);
if (args[*r - 
1].argstart[0] == '\'')
q[pr++] 
= 'r';
strncpy(q + pr, 
args[*r - 1].argstart, args[*r - 1].arglen);
diff --git a/clients/odbc/driver/SQLTables.c b/clients/odbc/driver/SQLTables.c
--- a/clients/odbc/driver/SQLTables.c
+++ b/clients/odbc/driver/SQLTables.c
@@ -82,6 +82,8 @@ MNDBTables(ODBCStmt *stmt,
  "cast(null as varchar(1)) as remarks "
   "from sys.env() e "
   "where e.name = 'gdk_dbname'");
+   if (query == NULL)
+   goto nomem;
} else if (NameLength1 == 0 &&
   NameLength3 == 0 &&
   SchemaName &&
@@ -96,6 +98,8 @@ MNDBTables(ODBCStmt *stmt,
* schema remarks */
  "cast(null as varchar(1)) as remarks "
   "from sys.schemas order by table_schem");
+   if (query == NULL)
+   goto nomem;
} else if (NameLength1 == 0 &&
   NameLength2 == 0 &&
   NameLength3 == 0 &&
@@ -108,6 +112,8 @@ MNDBTables(ODBCStmt *stmt,
  "table_type_name as table_type, "
  "cast(null as varchar(1)) as remarks "
   "from sys.table_types order by table_type");
+   if (query == NULL)
+   goto nomem;
} else {
/* no special case argument values */
size_t querylen;
@@ -241,8 +247,6 @@ MNDBTables(ODBCStmt *stmt,
free(sch);
if (tab)
free(tab);
-   if (query)
-   free(query);
/* Memory allocation error */
addStmtError(stmt, "HY001", NULL, 0);
return SQL_ERROR;
diff --git a/cmake/monetdb-functions.cmake b/cmake/monetdb-functions.cmake
--- a/cmake/monetdb-functions.cmake
+++ b/cmake/monetdb-functions.cmake
@@ -9,73 +9,60 @@
 function(monetdb_hg_revision)
   # Get the current version control revision
   if(EXISTS "${CMAKE_SOURCE_DIR}/.hg_archival.txt")
-execute_process(COMMAND "sed" "-n" "s/^node: 
\\([0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f]\\).*/\\1/p"
 ".hg_archival.txt" WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" RESULT_VARIABLE 
HG_RETURN_CODE
-  OUTPUT_VARIABLE HG_OUPUT_RES OUTPUT_STRIP_TRAILING_WHITESPACE)
-if(HG_RETURN_CODE EQUAL 0 AND HG_OUPUT_RES)
-  set(MERCURIAL_ID 

MonetDB: Jan2022 - Merge heads

2022-01-03 Thread Panagiotis Koutsourakis
Changeset: 10686cbf3739 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/10686cbf3739
Branch: Jan2022
Log Message:

Merge heads


diffs (truncated from 716 to 300 lines):

diff --git a/MonetDB.spec b/MonetDB.spec
--- a/MonetDB.spec
+++ b/MonetDB.spec
@@ -616,7 +616,11 @@ This package contains files needed to de
 
 %files SQL-server5-devel
 %defattr(-,root,root)
+%{_includedir}/monetdb/exception_buffer.h
+%{_includedir}/monetdb/opt_backend.h
+%{_includedir}/monetdb/rel_*.h
 %{_includedir}/monetdb/sql*.h
+%{_includedir}/monetdb/store_*.h
 
 %package embedded
 Summary: MonetDB as an embedded library
diff --git a/NT/mksqlwxs.py b/NT/mksqlwxs.py
--- a/NT/mksqlwxs.py
+++ b/NT/mksqlwxs.py
@@ -179,7 +179,7 @@ def main():
 print(r'')
 print(r'  ')
 id = comp(extend, id, 16,
-  sorted([r'include\monetdb\{}'.format(x) for x in filter(lambda 
x: (x.startswith('gdk') or x.startswith('monet') or x.startswith('mal') or 
x.startswith('sql')) and x.endswith('.h'), os.listdir(os.path.join(sys.argv[3], 
'include', 'monetdb')))] +
+  sorted([r'include\monetdb\{}'.format(x) for x in filter(lambda 
x: (x.startswith('gdk') or x.startswith('monet') or x.startswith('mal') or 
x.startswith('sql') or x.startswith('rel') or x.startswith('store') or 
x.startswith('exception') or x.startswith('opt_backend')) and x.endswith('.h'), 
os.listdir(os.path.join(sys.argv[3], 'include', 'monetdb')))] +
  [r'include\monetdb\copybinary.h',
   r'include\monetdb\mapi.h',
   r'include\monetdb\matomic.h',
diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -1433,6 +1433,27 @@ const char *wlrRef;
 Workingset workingset[THREADS];
 const char *zero_or_oneRef;
 
+# monetdbe
+char *monetdbe_append(monetdbe_database dbhdl, const char *schema, const char 
*table, monetdbe_column **input, size_t column_count);
+char *monetdbe_bind(monetdbe_statement *stmt, void *data, size_t parameter_nr);
+char *monetdbe_cleanup_result(monetdbe_database dbhdl, monetdbe_result 
*result);
+char *monetdbe_cleanup_statement(monetdbe_database dbhdl, monetdbe_statement 
*stmt);
+int monetdbe_close(monetdbe_database db);
+char *monetdbe_dump_database(monetdbe_database dbhdl, const char *backupfile);
+char *monetdbe_dump_table(monetdbe_database dbhdl, const char *schema_name, 
const char *table_name, const char *backupfile);
+char *monetdbe_error(monetdbe_database db);
+char *monetdbe_execute(monetdbe_statement *stmt, monetdbe_result **result, 
monetdbe_cnt *affected_rows);
+char *monetdbe_get_autocommit(monetdbe_database dbhdl, int *result);
+char *monetdbe_get_columns(monetdbe_database dbhdl, const char *schema_name, 
const char *table_name, size_t *column_count, monetdbe_column **columns);
+int monetdbe_in_transaction(monetdbe_database dbhdl);
+const void *monetdbe_null(monetdbe_database dbhdl, monetdbe_types t);
+int monetdbe_open(monetdbe_database *db, char *url, monetdbe_options *opts);
+char *monetdbe_prepare(monetdbe_database dbhdl, char *query, 
monetdbe_statement **stmt, monetdbe_result **result);
+char *monetdbe_query(monetdbe_database dbhdl, char *query, monetdbe_result 
**result, monetdbe_cnt *affected_rows);
+char *monetdbe_result_fetch(monetdbe_result *mres, monetdbe_column **res, 
size_t column_index);
+char *monetdbe_set_autocommit(monetdbe_database dbhdl, int value);
+const char *monetdbe_version(void);
+
 # stream
 stream *block_stream(stream *s);
 stream *block_stream2(stream *s, size_t bufsiz, compression_method comp);
diff --git a/debian/monetdb5-sql-dev.install b/debian/monetdb5-sql-dev.install
--- a/debian/monetdb5-sql-dev.install
+++ b/debian/monetdb5-sql-dev.install
@@ -1,1 +1,5 @@
+debian/tmp/usr/include/monetdb/exception_buffer.h usr/include/monetdb
+debian/tmp/usr/include/monetdb/opt_backend.h usr/include/monetdb
+debian/tmp/usr/include/monetdb/rel_*.h usr/include/monetdb
 debian/tmp/usr/include/monetdb/sql*.h usr/include/monetdb
+debian/tmp/usr/include/monetdb/store_*.h usr/include/monetdb
diff --git 
a/sql/test/emptydb-previous-upgrade-chain-hge/Tests/upgrade.stable.out.ppc64.int128
 
b/sql/test/emptydb-previous-upgrade-chain-hge/Tests/upgrade.stable.out.ppc64.int128
--- 
a/sql/test/emptydb-previous-upgrade-chain-hge/Tests/upgrade.stable.out.ppc64.int128
+++ 
b/sql/test/emptydb-previous-upgrade-chain-hge/Tests/upgrade.stable.out.ppc64.int128
@@ -5200,6 +5200,7 @@ drop view sys.dump_create_roles;
 drop view sys.describe_functions;
 drop view sys.describe_partition_tables;
 drop view sys.describe_privileges;
+drop view sys.fully_qualified_functions;
 drop view sys.describe_comments;
 drop view sys.describe_tables;
 drop view sys.describe_sequences;
@@ -5280,6 +5281,28 @@ CREATE VIEW sys.describe_tables AS
AND s.id = t.schema_id
AND ts.table_type_id = t.type
AND s.name <> 

MonetDB: Jan2022 - Add changelog entry

2022-01-03 Thread Panagiotis Koutsourakis
Changeset: d5cf7d95ecf8 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/d5cf7d95ecf8
Modified Files:
gdk/ChangeLog.Jan2022
Branch: Jan2022
Log Message:

Add changelog entry


diffs (22 lines):

diff --git a/gdk/ChangeLog.Jan2022 b/gdk/ChangeLog.Jan2022
--- a/gdk/ChangeLog.Jan2022
+++ b/gdk/ChangeLog.Jan2022
@@ -1,6 +1,18 @@
 # ChangeLog file for GDK
 # This file is updated with Maddlog
 
+* Mon Jan  3 2022 Panagiotis Koutsourakis 
+- Implement string imprints (strimps for short) a pre-filter structure
+  for strings in order to accelerate LIKE queries. If a strimp exists
+  for a specific string column the strings are pre-filtered, rejecting
+  strings that cannot possibly match, before the more expensive and
+  accurate matching algorithms run. Strimps are created automatically
+  or using 'sys.strimp_create' with arguments the names of the schema,
+  table and column. Automatic strimp creation is controlled by two
+  user settable gdk options: 'gdk_use_strimps' (default value "no") and
+  'gdk_strimps_threshold' (default value 1.000.000). See the manual for
+  more details.
+
 * Wed Aug 11 2021 Sjoerd Mullender 
 - Many (most) low level functions that could take a long time (such as
   BATjoin) can now be aborted with a timeout.  When the function takes too
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: Jan2022 - Make strimps opt-in

2022-01-03 Thread Panagiotis Koutsourakis
Changeset: 4c84a9be8cc8 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/4c84a9be8cc8
Modified Files:
monetdb5/modules/mal/pcre.c
Branch: Jan2022
Log Message:

Make strimps opt-in

The user needs to specify --set gdk_use_strimps=yes in order to enable strimp 
creation.


diffs (12 lines):

diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c
--- a/monetdb5/modules/mal/pcre.c
+++ b/monetdb5/modules/mal/pcre.c
@@ -1870,7 +1870,7 @@ PCRElikeselect(bat *ret, const bat *bid,
str msg = MAL_SUCCEED;
char *ppat = NULL;
bool use_re = false, use_strcmp = false, empty = false;
-   bool use_strimps = !GDKgetenv_istext("gdk_use_strimps", "no"), 
with_strimps = false;
+   bool use_strimps = GDKgetenv_isyes("gdk_use_strimps"), with_strimps = 
false;
 
if ((b = BATdescriptor(*bid)) == NULL) {
msg = createException(MAL, "algebra.likeselect", 
SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: Jan2022 - Make the strimp creation threshold user defined

2022-01-03 Thread Panagiotis Koutsourakis
Changeset: a5a8ed8f7f73 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/a5a8ed8f7f73
Modified Files:
monetdb5/modules/mal/pcre.c
Branch: Jan2022
Log Message:

Make the strimp creation threshold user defined

The parameter gdk_strimps_threshold specifies how many entries the string bat
should have before a strimp is created. The default value is 1.000.000.


diffs (20 lines):

diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c
--- a/monetdb5/modules/mal/pcre.c
+++ b/monetdb5/modules/mal/pcre.c
@@ -1871,6 +1871,7 @@ PCRElikeselect(bat *ret, const bat *bid,
char *ppat = NULL;
bool use_re = false, use_strcmp = false, empty = false;
bool use_strimps = GDKgetenv_isyes("gdk_use_strimps"), with_strimps = 
false;
+   BUN strimp_creation_threshold = GDKgetenv_int("gdk_strimps_threshold", 
100);
 
if ((b = BATdescriptor(*bid)) == NULL) {
msg = createException(MAL, "algebra.likeselect", 
SQLSTATE(HY002) RUNTIME_OBJECT_MISSING);
@@ -1893,7 +1894,7 @@ PCRElikeselect(bat *ret, const bat *bid,
 * A better solution is to run the PCRElikeselect as a LIKE query with
 * strimps and return the complement of the result.
 */
-   if (!empty && use_strimps && BATcount(b) >= STRIMP_CREATION_THRESHOLD 
&& !*anti) {
+   if (!empty && use_strimps && BATcount(b) >= strimp_creation_threshold 
&& !*anti) {
BAT *tmp_s = NULL;
if (STRMPcreate(b, NULL) == GDK_SUCCEED && (tmp_s = 
STRMPfilter(b, s, *pat))) {
if (s)
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Add function to append bitstring to a...

2021-11-15 Thread Panagiotis Koutsourakis
Changeset: 78f1dd084b9a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/78f1dd084b9a
Modified Files:
gdk/gdk_strimps.c
gdk/gdk_strimps.h
Branch: string_imprints
Log Message:

Add function to append bitstring to a strimp


diffs (69 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -716,6 +716,7 @@ STRMPcreate(BAT *b, BAT *s) {
}
bat_iterator_end();
 
+   r->strimps.free += ncand*sizeof(uint64_t);
pb->tstrimps = r;
pb->batDirtydesc = true;
persistStrimp(pb);
@@ -726,6 +727,48 @@ STRMPcreate(BAT *b, BAT *s) {
return GDK_SUCCEED;
 }
 
+gdk_return
+STRMPappendBitstring(BAT *b, const str s) {
+   lng t0 = 0;
+   BAT *pb;
+
+   TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
+   if (ATOMstorage(b->ttype) != TYPE_str) {
+   GDKerror("Cannot manipulate strimps index for non string 
bats\n");
+   return GDK_FAIL;
+   }
+
+   if (VIEWtparent(b)) {
+   pb = BBP_cache(VIEWtparent(b));
+   assert(pb);
+   } else {
+   pb = b;
+   }
+
+   if (!BATcheckstrimps(pb)) {
+   GDKerror("Strimp missing, cannot append value\n");
+   return GDK_FAIL;
+   }
+   MT_lock_set(&pb->batIdxLock);
+   // Check that there is space in the heap
+   if (pb->tstrimps->strimps.free < pb->tstrimps->strimps.size + 
sizeof(uint64_t)) {
+   pb->tstrimps->strimps.base[pb->tstrimps->strimps.free] = 
STRMPmakebitstring(s, pb->tstrimps);
+   pb->tstrimps->strimps.free += sizeof(uint64_t);
+   }
+   else {
+   // TODO reallocate buffer
+   }
+
+   // TODO increase reconstruction counter if
+   // reconstruction counter is larger than a threshold
+   // recompute the strimp from scratch.
+
+   MT_lock_unset(&pb->batIdxLock);
+
+   TRC_DEBUG(ACCELERATOR, "appending to strimp took " LLFMT " usec\n", 
GDKusec()-t0);
+   return GDK_SUCCEED;
+}
+
 /* Parallel creation. does not wok*/
 #if 0
 /* Creates the heap for a string imprint. Returns NULL on failure. This
diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h
--- a/gdk/gdk_strimps.h
+++ b/gdk/gdk_strimps.h
@@ -45,4 +45,5 @@ typedef struct {
 // gdk_export gdk_return STRMP_make_header(BAT *b);
 gdk_export gdk_return STRMPcreate(BAT *b, BAT *s);
 gdk_export BAT *STRMPfilter(BAT *b, BAT *s, const str q);
+gdk_export gdk_return STRMPappendBitstring(BAT *b, const str s);
 #endif /* _GDK_STRIMPS_H_ */
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Merge heads

2021-11-15 Thread Panagiotis Koutsourakis
Changeset: 2f528186e330 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/2f528186e330
Branch: string_imprints
Log Message:

Merge heads


diffs (294 lines):

diff --git a/clients/Tests/MAL-signatures.stable.out 
b/clients/Tests/MAL-signatures.stable.out
--- a/clients/Tests/MAL-signatures.stable.out
+++ b/clients/Tests/MAL-signatures.stable.out
@@ -9132,7 +9132,6 @@
 [ "optimizer", "mergetable",   "pattern optimizer.mergetable(X_0:str, 
X_1:str):str ",  "OPTwrapper;",  "Resolve the multi-table definitions"   ]
 [ "optimizer", "minimal_fast", "function optimizer.minimal_fast():void;",  
"", ""  ]
 [ "optimizer", "minimal_pipe", "function optimizer.minimal_pipe():void;",  
"", ""  ]
-[ "optimizer", "minimal_strimps_pipe", "function 
optimizer.minimal_strimps_pipe():void;",  "", ""  ]
 [ "optimizer", "minimalfast",  "pattern optimizer.minimalfast():str ", 
"OPTwrapper;",  ""  ]
 [ "optimizer", "minimalfast",  "pattern optimizer.minimalfast(X_0:str, 
X_1:str):str ", "OPTwrapper;",  "Fast compound minimal optimizer pipe"  ]
 [ "optimizer", "mitosis",  "pattern optimizer.mitosis():str ", 
"OPTwrapper;",  ""  ]
@@ -9168,7 +9167,6 @@
 [ "optimizer", "sequential_pipe",  "function 
optimizer.sequential_pipe():void;",   "", ""  ]
 [ "optimizer", "strimps",  "pattern optimizer.strimps():str ", 
"OPTwrapper;",  ""  ]
 [ "optimizer", "strimps",  "pattern optimizer.strimps(X_0:str, 
X_1:str):str ", "OPTwrapper;",  "Use strimps index if appropriate"  ]
-[ "optimizer", "strimps_pipe", "function optimizer.strimps_pipe():void;",  
"", ""  ]
 [ "optimizer", "volcano",  "pattern optimizer.volcano():str ", 
"OPTwrapper;",  ""  ]
 [ "optimizer", "volcano",  "pattern optimizer.volcano(X_0:str, 
X_1:str):str ", "OPTwrapper;",  "Simulate volcano style execution"  ]
 [ "optimizer", "volcano_pipe", "function optimizer.volcano_pipe():void;",  
"", ""  ]
@@ -9609,7 +9607,7 @@
 [ "streams",   "readStr",  "unsafe command 
streams.readStr(X_0:streams):str ", "mnstr_read_stringwrap;",   "read 
string data from the stream"  ]
 [ "streams",   "writeInt", "unsafe command streams.writeInt(X_0:streams, 
X_1:int):void ",  "mnstr_writeIntwrap;",  "write data on the stream"  ]
 [ "streams",   "writeStr", "unsafe command streams.writeStr(X_0:streams, 
X_1:str):void ",  "mnstr_write_stringwrap;",  "write data on the stream"
  ]
-[ "strimps",   "mkstrimp", "pattern strimps.mkstrimp(X_0:bat[:str], 
X_1:bat[:oid]):void ", "PATstrimpCreate;", "construct the strimp a BAT"]
+[ "strimps",   "mkstrimp", "unsafe pattern strimps.mkstrimp(X_0:bat[:str], 
X_1:bat[:oid]):void ",  "PATstrimpCreate;", "construct the strimp a BAT"
]
 [ "strimps",   "strimpfilter", "pattern strimps.strimpfilter(X_0:str, 
X_1:str):bit ",  "PATstrimpFilter;", ""  ]
 [ "strimps",   "strimpfilterjoin", "pattern 
strimps.strimpfilterjoin(X_0:str, X_1:any, X_2:any, X_3:any, X_4:any, X_5:any) 
(X_6:bat[:oid], X_7:bat[:str]) ",   "PATstrimpFilter;", ""  ]
 [ "strimps",   "strimpfilterselect",   "pattern 
strimps.strimpfilterselect(X_0:bat[:str], X_1:bat[:oid], X_2:str, 
X_3:bit):bat[:oid] ","PATstrimpFilterSelect;",   ""  ]
diff --git a/clients/Tests/MAL-signatures.stable.out.int128 
b/clients/Tests/MAL-signatures.stable.out.int128
--- a/clients/Tests/MAL-signatures.stable.out.int128
+++ b/clients/Tests/MAL-signatures.stable.out.int128
@@ -12928,7 +12928,7 @@
 [ "streams",   "readStr",  "unsafe command 
streams.readStr(X_0:streams):str ", "mnstr_read_stringwrap;",   "read 
string data from the stream"  ]
 [ "streams",   "writeInt", "unsafe command streams.writeInt(X_0:streams, 
X_1:int):void ",  "mnstr_writeIntwrap;",  "write data on the stream"  ]
 [ "streams",   "writeStr", "unsafe command streams.writeStr(X_0:streams, 
X_1:str):void ",  "mnstr_write_stringwrap;",  "write data on the stream"
  ]
-[ "strimps",   "mkstrimp", "pattern strimps.mkstrimp(X_0:bat[:str], 
X_1:bat[:oid]):void ", "PATstrimpCreate;", "construct the strimp a BAT"]
+[ "strimps",   "mkstrimp", "unsafe pattern strimps.mkstrimp(X_0:bat[:str], 
X_1:bat[:oid]):void ",  "PATstrimpCreate;", "construct the strimp a BAT"
]
 [ "strimps",   "strimpfilter", "pattern strimps.strimpfilter(X_0:str, 
X_1:str):bit ",  "PATstrimpFilter;", ""  ]
 [ "strimps",   "strimpfilterjoin", "pattern 
strimps.strimpfilterjoin(X_0:str, X_1:any, X_2:any, X_3:any, X_4:any, X_5:any) 
(X_6:bat[:oid], X_7:bat[:str]) ",   "PATstrimpFilter;", ""  ]
 [ "strimps",   "strimpfilterselect",   "pattern 
strimps.strimpfilterselect(X_0:bat[:str], X_1:bat[:oid], X_2:str, 
X_3:bit):bat[:oid] ","PATstrimpFilterSelect;",   ""  ]
diff --git a/sql/test/emptydb/Tests/check.stable.out 
b/sql/test/emptydb/Tests/check.stable.out
--- 

MonetDB: string_imprints - Add tests specific to strimps

2021-11-15 Thread Panagiotis Koutsourakis
Changeset: 34589d226cb2 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/34589d226cb2
Added Files:
sql/test/strimps/Tests/All
sql/test/strimps/Tests/persisted_strimp.SQL.py
sql/test/strimps/Tests/strimps_stable_counts.SQL.py
Branch: string_imprints
Log Message:

Add tests specific to strimps


diffs (129 lines):

diff --git a/sql/test/strimps/Tests/All b/sql/test/strimps/Tests/All
new file mode 100644
--- /dev/null
+++ b/sql/test/strimps/Tests/All
@@ -0,0 +1,2 @@
+strimps_stable_counts
+persisted_strimp
diff --git a/sql/test/strimps/Tests/persisted_strimp.SQL.py 
b/sql/test/strimps/Tests/persisted_strimp.SQL.py
new file mode 100644
--- /dev/null
+++ b/sql/test/strimps/Tests/persisted_strimp.SQL.py
@@ -0,0 +1,57 @@
+import os
+import socket
+import tempfile
+
+try:
+from MonetDBtesting import process
+except ImportError:
+import process
+from MonetDBtesting.sqltest import SQLTestCase
+
+COUNT_QUERY = "SELECT COUNT(*) FROM orders WHERE o_comment LIKE '%%slyly%%';"
+
+sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+sock.bind(('', 0))
+port = sock.getsockname()[1]
+sock.close()
+
+# Make sure that reading a persisted strimp from disk gives correct
+# results.
+
+with tempfile.TemporaryDirectory() as farm_dir:
+fdir = os.path.join(farm_dir, 'db1')
+os.mkdir(fdir)
+with process.server(mapiport=port, dbname='db1',
+args=["--set", "gdk_use_strimps=yes",],
+dbfarm=fdir,
+stdin=process.PIPE,
+stdout=process.PIPE,
+stderr=process.PIPE) as s:
+with SQLTestCase() as mdb:
+mdb.connect(database='db1', port=port, username='monetdb', 
password='monetdb')
+mdb.execute("""CREATE TABLE orders (
+  o_orderkey   BIGINT NOT NULL,
+  o_custkeyINTEGER NOT NULL,
+  o_orderstatusCHAR(1) NOT NULL,
+  o_totalprice DECIMAL(15,2) NOT NULL,
+  o_orderdate  DATE NOT NULL,
+  o_orderpriority  CHAR(15) NOT NULL,
+  o_clerk  CHAR(15) NOT NULL,
+  o_shippriority   INTEGER NOT NULL,
+  o_commentVARCHAR(79) NOT 
NULL);""").assertSucceeded()
+mdb.execute("""COPY 15000 RECORDS INTO orders from 
r'{}/sql/benchmarks/tpch/SF-0.01/orders.tbl' USING DELIMITERS 
'|','\n','"';""".format(os.getenv('TSTSRCBASE'))).assertSucceeded()
+mdb.execute("""COPY 15000 RECORDS INTO orders from 
r'{}/sql/benchmarks/tpch/SF-0.01/orders.tbl' USING DELIMITERS 
'|','\n','"';""".format(os.getenv('TSTSRCBASE'))).assertSucceeded()
+mdb.execute("""COPY 15000 RECORDS INTO orders from 
r'{}/sql/benchmarks/tpch/SF-0.01/orders.tbl' USING DELIMITERS 
'|','\n','"';""".format(os.getenv('TSTSRCBASE'))).assertSucceeded()
+mdb.execute("""COPY 15000 RECORDS INTO orders from 
r'{}/sql/benchmarks/tpch/SF-0.01/orders.tbl' USING DELIMITERS 
'|','\n','"';""".format(os.getenv('TSTSRCBASE'))).assertSucceeded()
+mdb.execute("SELECT COUNT(*) FROM orders WHERE o_comment LIKE 
'%%slyly%%';").assertSucceeded().assertDataResultMatch([(12896,)])
+s.communicate()
+
+with process.server(mapiport=port, dbname='db1',
+args=["--set", "gdk_use_strimps=yes",],
+dbfarm=fdir,
+stdin=process.PIPE, stdout=process.PIPE, 
stderr=process.PIPE) as s:
+with SQLTestCase() as mdb:
+mdb.connect(database='db1', port=port, username='monetdb', 
password='monetdb')
+mdb.execute("SELECT COUNT(*) FROM orders WHERE o_comment LIKE 
'%%slyly%%';").assertSucceeded().assertDataResultMatch([(12896,)])
+mdb.execute("SELECT COUNT(*) FROM orders WHERE o_comment LIKE 
'%%slyly%%';").assertSucceeded().assertDataResultMatch([(12896,)])
+s.communicate()
diff --git a/sql/test/strimps/Tests/strimps_stable_counts.SQL.py 
b/sql/test/strimps/Tests/strimps_stable_counts.SQL.py
new file mode 100644
--- /dev/null
+++ b/sql/test/strimps/Tests/strimps_stable_counts.SQL.py
@@ -0,0 +1,55 @@
+import os
+import socket
+import tempfile
+
+try:
+from MonetDBtesting import process
+except ImportError:
+import process
+from MonetDBtesting.sqltest import SQLTestCase
+
+COUNT_QUERY = "SELECT COUNT(*) FROM orders WHERE o_comment LIKE '%%slyly%%';"
+
+sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+sock.bind(('', 0))
+port = sock.getsockname()[1]
+sock.close()
+
+# Make sure that using a strimp returns the same number of rows as
+# not using it.
+
+with tempfile.TemporaryDirectory() as farm_dir:
+fdir = os.path.join(farm_dir, 'db1')
+os.mkdir(fdir)
+with process.server(mapiport=port, dbname='db1',
+dbfarm=fdir,
+ 

MonetDB: string_imprints - Automated merge with ssh://dev.monetd...

2021-11-17 Thread Panagiotis Koutsourakis
Changeset: caf72b9fc8d4 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/caf72b9fc8d4
Branch: string_imprints
Log Message:

Automated merge with ssh://dev.monetdb.org/MonetDB


diffs (192 lines):

diff --git a/gdk/gdk_align.c b/gdk/gdk_align.c
--- a/gdk/gdk_align.c
+++ b/gdk/gdk_align.c
@@ -378,7 +378,6 @@ VIEWdestroy(BAT *b)
IMPSdestroy(b);
OIDXdestroy(b);
PROPdestroy(b);
-   STRMPdestroy(b);
VIEWunlink(b);
 
MT_lock_set(>theaplock);
diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -229,6 +229,8 @@ void IMPSincref(Imprints *imprints)
 void IMPSprint(BAT *b) /* never called: for debugging only */
__attribute__((__cold__));
 #endif
+void STRMPincref(Strimps *strimps)
+   __attribute__((__visibility__("hidden")));
 void STRMPdecref(Strimps *strimps, bool remove)
__attribute__((__visibility__("hidden")));
 void STRMPdestroy(BAT *b)
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -447,6 +447,8 @@ BATcheckstrimps(BAT *b)
hp->bitstrings_base = 
hp->strimps.base + hsize;/* bitmasks just after the pairs */
 
close(fd);
+   ATOMIC_INIT(>strimps.refs, 
1);
+   // STRMPincref(hp);
hp->strimps.parentid = 
b->batCacheid;
b->tstrimps = hp;
TRC_DEBUG(ACCELERATOR, 
"BATcheckstrimps(" ALGOBATFMT "): reusing persisted strimp\n", ALGOBATPAR(b));
@@ -499,12 +501,16 @@ STRMPfilter(BAT *b, BAT *s, const str q)
BAT *pb = BBP_cache(VIEWtparent(b));
if (!BATcheckstrimps(pb))
goto sfilter_fail;
+   MT_lock_set(>batIdxLock);
strmps = pb->tstrimps;
+   MT_lock_unset(>batIdxLock);
}
else {
if (!BATcheckstrimps(b))
goto sfilter_fail;
+   MT_lock_set(>batIdxLock);
strmps = b->tstrimps;
+   MT_lock_unset(>batIdxLock);
}
 
ncand = canditer_init(, b, s);
@@ -666,7 +672,7 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s)
r->bitstrings_base = h2;
r->strimps.free = sz;
r->rec_cnt = 0;
-
+   ATOMIC_INIT(>strimps.refs, 1);
}
return r;
 }
@@ -677,6 +683,7 @@ STRMPcreate(BAT *b, BAT *s)
lng t0 = 0;
BAT *pb;
 
+   MT_thread_setalgorithm("create strimp index");
TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
if (ATOMstorage(b->ttype) != TYPE_str) {
GDKerror("Cannot create strimps index for non string bats\n");
@@ -709,7 +716,6 @@ STRMPcreate(BAT *b, BAT *s)
 MT_lock_unset(>batIdxLock);
return GDK_FAIL;
 }
-   HEAPincref(>strimps);
dh = (uint64_t *)r->bitstrings_base;
 
/* Compute bitstrings */
@@ -789,20 +795,82 @@ STRMPappendBitstring(BAT *b, const str s
 }
 
 void
+STRMPbatdecref(BAT *b, bool remove)
+{
+   Strimps *strimps;
+   BAT *pb = NULL;
+
+   if (VIEWtparent(b)) {
+   pb = BBP_cache(VIEWtparent(b));
+   assert(pb);
+   } else {
+   pb = b;
+   }
+
+   MT_lock_set(&pb->batIdxLock);
+   if (pb && pb->tstrimps && pb->tstrimps != (Strimps *)1) {
+   strimps = pb->tstrimps;
+   } else {
+   MT_lock_unset(&pb->batIdxLock);
+   return;
+   }
+   STRMPdecref(strimps, remove);
+   MT_lock_unset(&pb->batIdxLock);
+}
+
+void
+STRMPbatincref(BAT *b)
+{
+   Strimps *strimps;
+   BAT *pb = NULL;
+
+   if (VIEWtparent(b)) {
+   pb = BBP_cache(VIEWtparent(b));
+   assert(pb);
+   } else {
+   pb = b;
+   }
+
+   MT_lock_set(&pb->batIdxLock);
+   if (pb && pb->tstrimps && pb->tstrimps != (Strimps *)1) {
+   strimps = pb->tstrimps;
+   } else {
+   MT_lock_unset(&pb->batIdxLock);
+   return;
+   }
+   STRMPincref(strimps);
+   MT_lock_unset(&pb->batIdxLock);
+
+}
+
+void
 STRMPdecref(Strimps *strimps, bool remove)
 {
+   TRC_DEBUG(ACCELERATOR, "Decrement ref count of %s to " ULLFMT "\n",
+ strimps->strimps.filename, ATOMIC_GET(&strimps->strimps.refs) 
- 1);
strimps->strimps.remove |= remove;
if (ATOMIC_DEC(&strimps->strimps.refs) == 0) {
ATOMIC_DESTROY(&strimps->strimps.refs);
HEAPfree(&strimps->strimps, strimps->strimps.remove);
GDKfree(strimps);
}
+
+}
+
+void
+STRMPincref(Strimps *strimps)
+{
+   

MonetDB: string_imprints - Take a lock before getting the strimp

2021-11-17 Thread Panagiotis Koutsourakis
Changeset: f7d7df6b897a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/f7d7df6b897a
Modified Files:
gdk/gdk_align.c
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Take a lock before getting the strimp

This needs some more thought


diffs (49 lines):

diff --git a/gdk/gdk_align.c b/gdk/gdk_align.c
--- a/gdk/gdk_align.c
+++ b/gdk/gdk_align.c
@@ -378,7 +378,6 @@ VIEWdestroy(BAT *b)
IMPSdestroy(b);
OIDXdestroy(b);
PROPdestroy(b);
-   STRMPdestroy(b);
VIEWunlink(b);
 
MT_lock_set(>theaplock);
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -501,12 +501,16 @@ STRMPfilter(BAT *b, BAT *s, const str q)
BAT *pb = BBP_cache(VIEWtparent(b));
if (!BATcheckstrimps(pb))
goto sfilter_fail;
+   MT_lock_set(>batIdxLock);
strmps = pb->tstrimps;
+   MT_lock_unset(>batIdxLock);
}
else {
if (!BATcheckstrimps(b))
goto sfilter_fail;
+   MT_lock_set(>batIdxLock);
strmps = b->tstrimps;
+   MT_lock_unset(>batIdxLock);
}
 
ncand = canditer_init(, b, s);
@@ -793,12 +797,17 @@ STRMPappendBitstring(BAT *b, const str s
 void
 STRMPdecref(Strimps *strimps, bool remove)
 {
+   TRC_DEBUG(ACCELERATOR, "Decrement ref count of %s to " ULLFMT "\n",
+ strimps->strimps.filename, ATOMIC_GET(>strimps.refs) 
- 1);
strimps->strimps.remove |= remove;
if (ATOMIC_DEC(>strimps.refs) == 0) {
ATOMIC_DESTROY(>strimps.refs);
HEAPfree(>strimps, strimps->strimps.remove);
GDKfree(strimps);
}
+
+}
+
 void
 STRMPincref(Strimps *strimps)
 {
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Approve test

2021-11-17 Thread Panagiotis Koutsourakis
Changeset: 838e562ffb2a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/838e562ffb2a
Modified Files:
clients/Tests/exports.stable.out
Branch: string_imprints
Log Message:

Approve test


diffs (12 lines):

diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -394,6 +394,8 @@ BUN SORTfnd(BAT *b, const void *v);
 BUN SORTfndfirst(BAT *b, const void *v);
 BUN SORTfndlast(BAT *b, const void *v);
 gdk_return STRMPappendBitstring(BAT *b, const str s);
+void STRMPbatdecref(BAT *, bool);
+void STRMPbatincref(BAT *);
 gdk_return STRMPcreate(BAT *b, BAT *s);
 BAT *STRMPfilter(BAT *b, BAT *s, const str q);
 MT_Id THRcreate(void (*f)(void *), void *arg, enum MT_thr_detach d, const char 
*name);
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Initialize heap ref counts on constr...

2021-11-17 Thread Panagiotis Koutsourakis
Changeset: 53920a0f4f9a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/53920a0f4f9a
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Initialize heap ref counts on construction


diffs (53 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -447,6 +447,8 @@ BATcheckstrimps(BAT *b)
hp->bitstrings_base = 
hp->strimps.base + hsize;/* bitmasks just after the pairs */
 
close(fd);
+   ATOMIC_INIT(>strimps.refs, 
1);
+   // STRMPincref(hp);
hp->strimps.parentid = 
b->batCacheid;
b->tstrimps = hp;
TRC_DEBUG(ACCELERATOR, 
"BATcheckstrimps(" ALGOBATFMT "): reusing persisted strimp\n", ALGOBATPAR(b));
@@ -666,7 +668,7 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s)
r->bitstrings_base = h2;
r->strimps.free = sz;
r->rec_cnt = 0;
-
+   ATOMIC_INIT(>strimps.refs, 1);
}
return r;
 }
@@ -677,6 +679,7 @@ STRMPcreate(BAT *b, BAT *s)
lng t0 = 0;
BAT *pb;
 
+   MT_thread_setalgorithm("create strimp index");
TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
if (ATOMstorage(b->ttype) != TYPE_str) {
GDKerror("Cannot create strimps index for non string bats\n");
@@ -709,7 +712,6 @@ STRMPcreate(BAT *b, BAT *s)
 MT_lock_unset(>batIdxLock);
return GDK_FAIL;
 }
-   HEAPincref(>strimps);
dh = (uint64_t *)r->bitstrings_base;
 
/* Compute bitstrings */
@@ -810,6 +812,7 @@ void
 STRMPdestroy(BAT *b)
 {
if (b && b->tstrimps) {
+   TRC_DEBUG(ACCELERATOR, "Destroying strimp %s\n", 
b->tstrimps->strimps.filename);
MT_lock_set(>batIdxLock);
if (b->tstrimps == (Strimps *)1) {
b->tstrimps = NULL;
@@ -829,6 +832,7 @@ void
 STRMPfree(BAT *b)
 {
if (b && b->tstrimps) {
+   TRC_DEBUG(ACCELERATOR, "Freeing strimp for BAT %s\n", 
b->tstrimps->strimps.filename);
Strimps *s;
MT_lock_set(>batIdxLock);
if ((s = b->tstrimps) != NULL && s != (Strimps *)1) {
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Add STRMPincref gdk private function

2021-11-17 Thread Panagiotis Koutsourakis
Changeset: 3f012d974065 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/3f012d974065
Modified Files:
gdk/gdk_private.h
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Add STRMPincref gdk private function


diffs (29 lines):

diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -229,6 +229,8 @@ void IMPSincref(Imprints *imprints)
 void IMPSprint(BAT *b) /* never called: for debugging only */
__attribute__((__cold__));
 #endif
+void STRMPincref(Strimps *strimps)
+   __attribute__((__visibility__("hidden")));
 void STRMPdecref(Strimps *strimps, bool remove)
__attribute__((__visibility__("hidden")));
 void STRMPdestroy(BAT *b)
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -797,6 +797,13 @@ STRMPdecref(Strimps *strimps, bool remov
HEAPfree(>strimps, strimps->strimps.remove);
GDKfree(strimps);
}
+void
+STRMPincref(Strimps *strimps)
+{
+   TRC_DEBUG(ACCELERATOR, "Increment ref count of %s to " ULLFMT "\n",
+ strimps->strimps.filename, ATOMIC_GET(>strimps.refs) 
+ 1);
+   (void)ATOMIC_INC(>strimps.refs);
+
 }
 
 void
___
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list


MonetDB: string_imprints - Merge with default

2021-11-17 Thread Panagiotis Koutsourakis
Changeset: 99f9be40d724 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/99f9be40d724
Modified Files:
monetdb5/modules/mal/batExtensions.c
monetdb5/modules/mal/pcre.c
sql/backends/monet5/sql.c
sql/test/emptydb/Tests/check.stable.out
sql/test/emptydb/Tests/check.stable.out.int128
Branch: string_imprints
Log Message:

Merge with default


diffs (truncated from 1709 to 300 lines):

diff --git a/clients/mapiclient/dump.c b/clients/mapiclient/dump.c
--- a/clients/mapiclient/dump.c
+++ b/clients/mapiclient/dump.c
@@ -2391,7 +2391,7 @@ dump_database(Mapi mid, stream *toConsol
const char *sequences2 =
"SELECT s.name, "
 "seq.name, "
-"get_value_for(s.name, seq.name), "
+"peak_next_value_for(s.name, seq.name), "
 "seq.\"minvalue\", "
 "seq.\"maxvalue\", "
 "seq.\"increment\", "
diff --git a/common/utils/matomic.h b/common/utils/matomic.h
--- a/common/utils/matomic.h
+++ b/common/utils/matomic.h
@@ -57,7 +57,6 @@
 /* #define NO_ATOMIC_INSTRUCTIONS */
 
 /* the atomic type we export is always a 64 bit unsigned integer */
-typedef uint64_t ATOMIC_BASE_TYPE;
 
 /* ignore __STDC_NO_ATOMICS__ if compiling using Intel compiler on
  * Windows since otherwise we can't compile this at all in C99 mode */
@@ -67,8 +66,10 @@ typedef uint64_t ATOMIC_BASE_TYPE;
 
 #if SIZEOF_LONG_LONG == 8
 typedef volatile atomic_ullong ATOMIC_TYPE;
+typedef unsigned long long ATOMIC_BASE_TYPE;
 #elif SIZEOF_LONG == 8
 typedef volatile atomic_ulong ATOMIC_TYPE;
+typedef unsigned long ATOMIC_BASE_TYPE;
 #else
 #error "we need a 64 bit atomic type"
 #endif
@@ -106,6 +107,8 @@ typedef volatile atomic_flag ATOMIC_FLAG
 
 #elif defined(_MSC_VER) && !defined(NO_ATOMIC_INSTRUCTIONS)
 
+typedef uint64_t ATOMIC_BASE_TYPE;
+
 #include 
 
 /* On Windows, with Visual Studio 2005, the compiler uses acquire
@@ -227,6 +230,7 @@ typedef volatile int ATOMIC_FLAG;
 /* the new way of doing this according to GCC (the old way, using
  * __sync_* primitives is not supported) */
 
+typedef uint64_t ATOMIC_BASE_TYPE;
 typedef volatile ATOMIC_BASE_TYPE ATOMIC_TYPE;
 
 #define ATOMIC_VAR_INIT(val)   (val)
@@ -262,6 +266,8 @@ typedef volatile char ATOMIC_FLAG;
 
 /* emulate using mutexes */
 
+typedef uint64_t ATOMIC_BASE_TYPE;
+
 #include  /* required for pthread_mutex_t */
 
 typedef struct {
diff --git a/monetdb5/mal/mal_client.c b/monetdb5/mal/mal_client.c
--- a/monetdb5/mal/mal_client.c
+++ b/monetdb5/mal/mal_client.c
@@ -598,7 +598,7 @@ MCreadClient(Client c)
in->pos++;
 
if (in->pos >= in->len || in->mode) {
-   ssize_t rd, sum = 0;
+   ssize_t rd;
 
if (in->eof || !isa_block_stream(c->fdout)) {
if (!isa_block_stream(c->fdout) && c->promptlength > 0)
@@ -607,7 +607,6 @@ MCreadClient(Client c)
in->eof = false;
}
while ((rd = bstream_next(in)) > 0 && !in->eof) {
-   sum += rd;
if (!in->mode) /* read one line at a time in line mode 
*/
break;
}
diff --git a/monetdb5/modules/mal/batExtensions.c 
b/monetdb5/modules/mal/batExtensions.c
--- a/monetdb5/modules/mal/batExtensions.c
+++ b/monetdb5/modules/mal/batExtensions.c
@@ -100,7 +100,7 @@ CMDBATsingle(Client cntxt, MalBlkPtr mb,
if( b == 0)
throw(MAL,"bat.single", SQLSTATE(HY013) MAL_MALLOC_FAIL);
if (ATOMextern(b->ttype))
-   u = (ptr) *(str *)u;
+   u = (ptr) *(ptr *)u;
if (BUNappend(b, u, false) != GDK_SUCCEED) {
BBPreclaim(b);
throw(MAL, "bat.single", SQLSTATE(HY013) MAL_MALLOC_FAIL);
diff --git a/monetdb5/modules/mal/mal_io.c b/monetdb5/modules/mal/mal_io.c
--- a/monetdb5/modules/mal/mal_io.c
+++ b/monetdb5/modules/mal/mal_io.c
@@ -120,8 +120,8 @@ IOprintBoth(Client cntxt, MalBlkPtr mb, 
if (hd)
mnstr_printf(fp, "%s", hd);
 
-   if (ATOMvarsized(tpe))
-   ATOMprint(tpe, *(str *) val, fp);
+   if (ATOMextern(tpe))
+   ATOMprint(tpe, *(ptr *) val, fp);
else
ATOMprint(tpe, val, fp);
 
diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c
--- a/monetdb5/modules/mal/pcre.c
+++ b/monetdb5/modules/mal/pcre.c
@@ -285,8 +285,6 @@ mywstrcasestr(const char *restrict hayst
if (nlen == 0)
return atend ? haystack + strlen(haystack) : haystack;
 
-   size_t hlen = strlen(haystack);
-
while (*haystack) {
size_t i;
size_t h;
@@ -312,7 +310,6 @@ mywstrcasestr(const char *restrict hayst
if (i == nlen && (!atend || haystack[h] == 0))
return haystack;
haystack += step;
-   hlen -= step;
   

<    3   4   5   6   7   8   9   10   11   >