MonetDB: string_imprints - Read and write the descriptor correctly
Changeset: a2c6fcd81f79 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/a2c6fcd81f79 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Read and write the descriptor correctly diffs (130 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -138,7 +138,7 @@ static int8_t strimp_lookup(Strimps *s, CharPair *p) { int8_t ret = -1; size_t idx = 0; - size_t npairs = NPAIRS((uint64_t)s->strimps.base[0]); + size_t npairs = NPAIRS(((uint64_t *)s->strimps.base)[0]); size_t offset = 0; CharPair sp; (void)p; @@ -181,7 +181,7 @@ STRMPmakebitstring(const str s, Strimps pi.lim = strlen(s); while(pair_at(, )) { - pair_idx = strimp_lookup(r, ); + pair_idx = STRMPpairLookup(r, ); if (pair_idx > 0) ret |= 0x1 << pair_idx; next_pair(); @@ -190,8 +190,6 @@ STRMPmakebitstring(const str s, Strimps return ret; } - - /* Given a histogram find the indices of the STRIMP_HEADER_SIZE largest * counts. * @@ -321,6 +319,7 @@ STRMPbuildHeader(BAT *b, CharPair *hpair if(hist[hidx].p) { GDKfree(hist[hidx].p->pbytes); GDKfree(hist[hidx].p); + hist[hidx].p = NULL; } } GDKfree(hist); @@ -333,7 +332,6 @@ STRMPbuildHeader(BAT *b, CharPair *hpair static Strimps * STRMPcreateStrimp(BAT *b) { - uint64_t *d; uint8_t *h1, *h2; Strimps *r = NULL; uint64_t descriptor; @@ -354,16 +352,15 @@ STRMPcreateStrimp(BAT *b) if ((r = GDKzalloc(sizeof(Strimps))) == NULL || (r->strimps.farmid = BBPselectfarm(b->batRole, b->ttype, strimpheap)) < 0 || strconcat_len(r->strimps.filename, sizeof(r->strimps.filename), - nme, ".strimp", NULL) >= sizeof(r->strimps.filename) || + nme, ".tstrimps", NULL) >= sizeof(r->strimps.filename) || HEAPalloc(>strimps, BATcount(b)*sizeof(uint64_t) + sz, sizeof(uint8_t), 0) != GDK_SUCCEED) { GDKfree(r); return NULL; } - descriptor = STRIMP_VERSION | STRIMP_HEADER_SIZE << 8 | ((uint64_t)sz) << 16; + descriptor = STRIMP_VERSION | ((uint64_t)STRIMP_HEADER_SIZE) << 8 | ((uint64_t)sz) << 16; - d = (uint64_t 
*)r->strimps.base; - *d = descriptor; + ((uint64_t *)r->strimps.base)[0] = descriptor; r->sizes_base = h1 = (uint8_t *)r->strimps.base + 8; r->pairs_base = h2 = (uint8_t *)h1 + STRIMP_HEADER_SIZE; @@ -505,7 +502,6 @@ persistStrimp(BAT *b) if((BBP_status(b->batCacheid) & BBPEXISTING) && b->batInserted == b->batCount) } -#endif /* Create */ gdk_return @@ -521,31 +517,32 @@ STRMPcreate(BAT *b) assert(b->ttype == TYPE_str); TRC_DEBUG_IF(ALGO) t0 = GDKusec(); - if (b->tstrimps == NULL) { - if ((h = STRMPcreateStrimp(b)) == NULL) { - return GDK_FAIL; - } - dh = (uint64_t *)h->strimps.base + h->strimps.free; + if (BATcheckstrimps(b)) + return GDK_SUCCEED; + + if ((h = STRMPcreateStrimp(b)) == NULL) { + return GDK_FAIL; + } + dh = (uint64_t *)h->strimps.base + h->strimps.free; - bi = bat_iterator(b); - for (i = 0; i < b->batCount; i++) { - s = (str)BUNtvar(bi, i); - if (!strNil(s)) - *dh++ = STRMPmakebitstring(s, h); - else - *dh++ = 0; /* no pairs in nil values */ - } - h->strimps.free += b->batCount*sizeof(uint64_t); + bi = bat_iterator(b); + for (i = 0; i < b->batCount; i++) { + s = (str)BUNtvar(bi, i); + if (!strNil(s)) + *dh++ = STRMPmakebitstring(s, h); + else + *dh++ = 0; /* no pairs in nil values */ + } + h->strimps.free += b->batCount*sizeof(uint64_t); - /* After we have computed the strimp, attempt to write it back -* to the BAT. -*/ - MT_lock_set(>batIdxLock); - b->tstrimps = h; - b->batDirtydesc = true; - /* persistStrimp(b) */ - MT_lock_unset(>batIdxLock); - } + /* After we have computed the strimp, attempt to write it back +* to the BAT. +*/ + MT_lock_set(>batIdxLock); + b->tstrimps = h; + b->batDirtydesc = true; + persistStrimp(b); + MT_lock_unset(>batIdxLock); TRC_DEBUG(ALGO, "strimp creation took " LLFMT " usec\n", GDKusec()-t0); return GDK_SUCCEED;
MonetDB: Jul2021 - Merge heads
Changeset: 2f44594a914e for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/2f44594a914e Modified Files: sql/server/sql_scan.c Branch: Jul2021 Log Message: Merge heads diffs (truncated from 5186 to 300 lines): diff --git a/clients/Tests/MAL-signatures.stable.out b/clients/Tests/MAL-signatures.stable.out --- a/clients/Tests/MAL-signatures.stable.out +++ b/clients/Tests/MAL-signatures.stable.out @@ -9170,6 +9170,7 @@ stdout of test 'MAL-signatures` in direc [ "rapi", "eval_aggr","pattern rapi.eval_aggr(X_0:ptr, X_1:str, X_2:any...):any... ", "RAPIevalAggr;","" ] [ "rapi", "prelude", "command rapi.prelude():void ", "RAPIprelude;", "" ] [ "rapi", "subeval_aggr", "pattern rapi.subeval_aggr(X_0:ptr, X_1:str, X_2:any...):any... ", "RAPIevalAggr;","" ] +[ "remote","assert", "pattern remote.assert(X_0:bit, X_1:str):void ","RMTassert;", "" ] [ "remote","batbincopy", "pattern remote.batbincopy():bat[:any] ", "RMTbincopyfrom;", "" ] [ "remote","batbincopy", "pattern remote.batbincopy(X_0:bat[:any]):void ", "RMTbincopyto;","" ] [ "remote","batload", "pattern remote.batload(X_0:any_1, X_1:int):bat[:any_1] ", "RMTbatload;", "" ] @@ -9271,6 +9272,7 @@ stdout of test 'MAL-signatures` in direc [ "sql", "deltas", "pattern sql.deltas(X_0:str, X_1:str) (X_2:bat[:int], X_3:bat[:lng], X_4:bat[:lng], X_5:bat[:lng], X_6:bat[:lng], X_7:bat[:lng], X_8:bat[:int]) ", "mvc_delta_values;","" ] [ "sql", "deltas", "pattern sql.deltas(X_0:str, X_1:str, X_2:str) (X_3:bat[:int], X_4:bat[:lng], X_5:bat[:lng], X_6:bat[:lng], X_7:bat[:lng], X_8:bat[:lng], X_9:bat[:int]) ", "mvc_delta_values;","" ] [ "sql", "dense_rank", "pattern sql.dense_rank(X_0:any_1, X_1:bit, X_2:bit):int ", "SQLdense_rank;", "" ] +[ "sql", "deregister", "pattern sql.deregister():int ", "RAstatementEnd;", "" ] [ "sql", "diff", "pattern sql.diff(X_0:any_1):bit ", "SQLdiff;", "" ] [ "sql", "diff", "pattern sql.diff(X_0:bit, X_1:any_1):bit ", "SQLdiff;", "" ] [ "sql", "drop_hash","unsafe pattern sql.drop_hash(X_0:str, X_1:str):void 
", "SQLdrop_hash;","" ] diff --git a/clients/Tests/MAL-signatures.stable.out.int128 b/clients/Tests/MAL-signatures.stable.out.int128 --- a/clients/Tests/MAL-signatures.stable.out.int128 +++ b/clients/Tests/MAL-signatures.stable.out.int128 @@ -12471,6 +12471,7 @@ stdout of test 'MAL-signatures` in direc [ "rapi", "eval_aggr","pattern rapi.eval_aggr(X_0:ptr, X_1:str, X_2:any...):any... ", "RAPIevalAggr;","" ] [ "rapi", "prelude", "command rapi.prelude():void ", "RAPIprelude;", "" ] [ "rapi", "subeval_aggr", "pattern rapi.subeval_aggr(X_0:ptr, X_1:str, X_2:any...):any... ", "RAPIevalAggr;","" ] +[ "remote","assert", "pattern remote.assert(X_0:bit, X_1:str):void ","RMTassert;", "" ] [ "remote","batbincopy", "pattern remote.batbincopy():bat[:any] ", "RMTbincopyfrom;", "" ] [ "remote","batbincopy", "pattern remote.batbincopy(X_0:bat[:any]):void ", "RMTbincopyto;","" ] [ "remote","batload", "pattern remote.batload(X_0:any_1, X_1:int):bat[:any_1] ", "RMTbatload;", "" ] @@ -12577,6 +12578,7 @@ stdout of test 'MAL-signatures` in direc [ "sql", "deltas", "pattern sql.deltas(X_0:str, X_1:str) (X_2:bat[:int], X_3:bat[:lng], X_4:bat[:lng], X_5:bat[:lng], X_6:bat[:lng], X_7:bat[:lng], X_8:bat[:int]) ", "mvc_delta_values;","" ] [ "sql", "deltas", "pattern sql.deltas(X_0:str, X_1:str, X_2:str) (X_3:bat[:int], X_4:bat[:lng], X_5:bat[:lng], X_6:bat[:lng], X_7:bat[:lng], X_8:bat[:lng], X_9:bat[:int]) ", "mvc_delta_values;","" ] [ "sql", "dense_rank", "pattern sql.dense_rank(X_0:any_1, X_1:bit, X_2:bit):int ", "SQLdense_rank;", "" ] +[ "sql", "deregister", "pattern sql.deregister():int ", "RAstatementEnd;", "" ] [ "sql", "diff", "pattern sql.diff(X_0:any_1):bit ", "SQLdiff;", "" ] [ "sql", "diff", "pattern sql.diff(X_0:bit, X_1:any_1):bit ", "SQLdiff;", "" ] [ "sql", "drop_hash","unsafe pattern sql.drop_hash(X_0:str, X_1:str):void ", "SQLdrop_hash;","" ] diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out --- a/clients/Tests/exports.stable.out +++ 
b/clients/Tests/exports.stable.out @@ -937,6 +937,7 @@ const char *deleteRef; void deleteSymbol(Module scope, Symbol prg); const char *deltaRef; const char *dense_rankRef; +const char *deregisterRef; malType destinationType(MalBlkPtr mb, InstrPtr p); const char *diffRef; const char *diffcandRef; diff --git
MonetDB: Jul2021 - Avoid looking up raw_strings variable on every read
Changeset: 119723724e8a for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/119723724e8a Modified Files: sql/server/sql_scan.c sql/server/sql_scan.h Branch: Jul2021 Log Message: Avoid looking up raw_strings variable on every read We keep a bit in the sql scanner that controls whether it reads raw strings or not. It is initialized using the raw_strings setting/property. diffs (149 lines): diff --git a/sql/server/sql_scan.c b/sql/server/sql_scan.c --- a/sql/server/sql_scan.c +++ b/sql/server/sql_scan.c @@ -528,6 +528,7 @@ scanner_init(struct scanner *s, bstream .rs = rs, .ws = ws, .mode = LINE_N, + .raw_string_mode = GDKgetenv_istrue("raw_strings"), }; } @@ -985,11 +986,9 @@ int scanner_symbol(mvc * c, int cur) return cur; return tokenize(c, cur); case '\'': -#ifdef SQL_STRINGS_USE_ESCAPES - if (lc->next_string_is_raw || GDKgetenv_istrue("raw_strings")) + if (lc->raw_string_mode || lc->next_string_is_raw) return scanner_string(c, cur, false); return scanner_string(c, cur, true); -#endif case '"': return scanner_string(c, cur, false); case '{': @@ -1272,9 +1271,7 @@ sql_get_next_token(YYSTYPE *yylval, void if (token == IDENT || token == COMPARISON || token == RANK || token == aTYPE || token == ALIAS) { yylval->sval = sa_strndup(c->sa, yylval->sval, lc->yycur-lc->yysval); -#ifdef SQL_STRINGS_USE_ESCAPES lc->next_string_is_raw = false; -#endif } else if (token == STRING) { char quote = *yylval->sval; char *str = sa_alloc( c->sa, (lc->yycur-lc->yysval-2)*2 + 1 ); @@ -1307,9 +1304,7 @@ sql_get_next_token(YYSTYPE *yylval, void strcpy(str, yylval->sval + 3); token = yylval->sval[2] == '\'' ? 
USTRING : UIDENT; quote = yylval->sval[2]; -#ifdef SQL_STRINGS_USE_ESCAPES lc->next_string_is_raw = true; -#endif break; case 'x': case 'X': @@ -1321,9 +1316,7 @@ sql_get_next_token(YYSTYPE *yylval, void *dst = 0; quote = '\''; token = XSTRING; -#ifdef SQL_STRINGS_USE_ESCAPES lc->next_string_is_raw = true; -#endif break; case 'r': case 'R': @@ -1336,9 +1329,7 @@ sql_get_next_token(YYSTYPE *yylval, void *dst = 0; break; default: -#ifdef SQL_STRINGS_USE_ESCAPES - if (GDKgetenv_istrue("raw_strings") || - lc->next_string_is_raw) { + if (lc->raw_string_mode || lc->next_string_is_raw) { dst = str; for (char *src = yylval->sval + 1; *src; dst++) if ((*dst = *src++) == '\'' && *src == '\'') @@ -1349,23 +1340,14 @@ sql_get_next_token(YYSTYPE *yylval, void (unsigned char *)yylval->sval + 1, lc->yycur - lc->yysval - 1); } -#else - dst = str; - for (char *src = yylval->sval + 1; *src; dst++) - if ((*dst = *src++) == '\'' && *src == '\'') - src++; - *dst = 0; -#endif break; } yylval->sval = str; /* reset original */ lc->rs->buf[lc->rs->pos+lc->yycur- 1] = quote; -#ifdef SQL_STRINGS_USE_ESCAPES } else { lc->next_string_is_raw = false; -#endif } return(token); diff --git a/sql/server/sql_scan.h b/sql/server/sql_scan.h --- a/sql/server/sql_scan.h +++ b/sql/server/sql_scan.h @@ -15,11 +15,6 @@ typedef enum { LINE_1, LINE_N } prot; -/* Currently, MonetDB interprets \ specially in strings. This is - * contrary to the SQL standard. Remove this define to revert to the - * standard interpretation. 
*/ -#define SQL_STRINGS_USE_ESCAPES 1 - struct scanner { bstream *rs; stream *ws; @@ -36,15 +31,31 @@ struct scanner { prot mode; /* which mode (line (1,N), blocked) */ char *schema; /* Keep schema name of create statement, needed AUTO_INCREMENT, SERIAL */ char *errstr; /* error message from the bowels of the scanner */ -#ifdef SQL_STRINGS_USE_ESCAPES - /* because we interpret \ in strings, we need state in the -* scanner so that we Do The Right Thing (TM) when we get a -* unicode string split up in multiple parts (i.e.
MonetDB: string_imprints - Merge with default
Changeset: 80f037721006 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/80f037721006 Modified Files: gdk/gdk_private.h Branch: string_imprints Log Message: Merge with default diffs (truncated from 5721 to 300 lines): diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out --- a/clients/Tests/exports.stable.out +++ b/clients/Tests/exports.stable.out @@ -541,8 +541,8 @@ gdk_return log_bat_transient(logger *lg, gdk_return log_constant(logger *lg, int type, ptr val, log_id id, lng offset, lng cnt); gdk_return log_delta(logger *lg, BAT *uid, BAT *uval, log_id id); gdk_return log_sequence(logger *lg, int seq, lng id); -gdk_return log_tend(logger *lg); -gdk_return log_tstart(logger *lg, ulng commit_ts, bool flush); +gdk_return log_tend(logger *lg, ulng commit_ts); +gdk_return log_tstart(logger *lg, bool flush); gdk_return logger_activate(logger *lg); lng logger_changes(logger *lg); logger *logger_create(int debug, const char *fn, const char *logdir, int version, preversionfix_fptr prefuncp, postversionfix_fptr postfuncp, void *funcdata); @@ -761,6 +761,7 @@ void MCcloseClient(Client c); Client MCforkClient(Client father); Client MCgetClient(int id); Client MCinitClient(oid user, bstream *fin, stream *fout); +size_t MCmemoryClaim(void); int MCpushClientInput(Client c, bstream *new_input, int listing, char *prompt); void MCstopClients(Client c); str MCsuspendClient(int id); diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c --- a/gdk/gdk_bat.c +++ b/gdk/gdk_bat.c @@ -809,12 +809,12 @@ COLcopy(BAT *b, int tt, bool writable, r /* first try case (1); create a view, possibly with different * atom-types */ - if (role == b->batRole && + if (!writable && + role == b->batRole && b->batRestricted == BAT_READ && ATOMstorage(b->ttype) != TYPE_msk && /* no view on TYPE_msk */ (!VIEWtparent(b) || -BBP_cache(VIEWtparent(b))->batRestricted == BAT_READ) && - !writable) { +BBP_cache(VIEWtparent(b))->batRestricted == BAT_READ)) { bn = VIEWcreate(b->hseqbase, b); if 
(bn == NULL) return NULL; @@ -875,8 +875,8 @@ COLcopy(BAT *b, int tt, bool writable, r strconcat_len(thp.filename, sizeof(thp.filename), BBP_physical(bn->batCacheid), ".theap", NULL); - if ((b->ttype && HEAPcopy(, b->theap) != GDK_SUCCEED) || - (bn->tvheap && HEAPcopy(, b->tvheap) != GDK_SUCCEED)) { + if ((b->ttype && HEAPcopy(, b->theap, b->tbaseoff << b->tshift) != GDK_SUCCEED) || + (bn->tvheap && HEAPcopy(, b->tvheap, 0) != GDK_SUCCEED)) { HEAPfree(, true); HEAPfree(, true); BBPreclaim(bn); diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c --- a/gdk/gdk_batop.c +++ b/gdk/gdk_batop.c @@ -32,7 +32,7 @@ unshare_varsized_heap(BAT *b) h->farmid = BBPselectfarm(b->batRole, TYPE_str, varheap); strconcat_len(h->filename, sizeof(h->filename), BBP_physical(b->batCacheid), ".theap", NULL); - if (HEAPcopy(h, b->tvheap) != GDK_SUCCEED) { + if (HEAPcopy(h, b->tvheap, 0) != GDK_SUCCEED) { HEAPfree(h, true); GDKfree(h); return GDK_FAIL; @@ -496,7 +496,7 @@ append_varsized_bat(BAT *b, BAT *n, stru h->farmid = BBPselectfarm(b->batRole, b->ttype, varheap); strconcat_len(h->filename, sizeof(h->filename), BBP_physical(b->batCacheid), ".theap", NULL); - if (HEAPcopy(h, b->tvheap) != GDK_SUCCEED) { + if (HEAPcopy(h, b->tvheap, 0) != GDK_SUCCEED) { HEAPfree(h, true); GDKfree(h); return GDK_FAIL; diff --git a/gdk/gdk_group.c b/gdk/gdk_group.c --- a/gdk/gdk_group.c +++ b/gdk/gdk_group.c @@ -453,41 +453,43 @@ rev(oid x) return x; } -/* population count: count number of 1 bits in a value */ -static inline int -pop(oid x) +/* count trailing zeros, also see candmask_lobit in gdk_cand.h */ +static inline int __attribute__((__const__)) +ctz(oid x) { -#ifdef __GNUC__ +#if defined(__GNUC__) #if SIZEOF_OID == SIZEOF_INT - return __builtin_popcount(x); + return __builtin_ctz(x); #else - return __builtin_popcountl(x); + return __builtin_ctzl(x); #endif -#else -#ifdef _MSC_VER +#elif defined(_MSC_VER) #if SIZEOF_OID == SIZEOF_INT - return (int) __popcnt((unsigned int) (x)); -#else - return (int) 
__popcnt64((unsigned __int64) (x)); -#endif + unsigned long idx; + if (_BitScanForward(, (unsigned long) x)) +
MonetDB: string_imprints - Fix strimp generation bugs
Changeset: a7567eea4081 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/a7567eea4081 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Fix strimp generation bugs diffs (55 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -133,12 +133,10 @@ next_pair(PairIterator *pi) { static int8_t STRMPpairLookup(Strimps *s, CharPair *p) { - int8_t ret = -1; size_t idx = 0; size_t npairs = NPAIRS(((uint64_t *)s->strimps.base)[0]); size_t offset = 0; CharPair sp; - (void)p; for (idx = 0; idx < npairs; idx++) { sp.psize = s->sizes_base[idx]; @@ -148,7 +146,7 @@ STRMPpairLookup(Strimps *s, CharPair *p) offset += sp.psize; } - return ret; + return -1; } static bool @@ -160,8 +158,8 @@ ignored(CharPair *p, uint8_t elm) { #define MAX_PAIR_SIZE 8 /* Given a strimp header and a string compute the bitstring of which - * digrams(byte pairs) are present in the string. The strimp header is a - * map from digram(byte pair) to index in the strimp. + * digrams are present in the string. The strimp header is a map from + * digram to index in the strimp. * * This should probably be inlined. */ @@ -179,8 +177,8 @@ STRMPmakebitstring(const str s, Strimps while(pair_at(, )) { pair_idx = STRMPpairLookup(r, ); - if (pair_idx > 0) - ret |= 0x1 << pair_idx; + if (pair_idx >= 0) + ret |= ((uint64_t)0x1 << pair_idx); next_pair(); } @@ -617,7 +615,6 @@ STRMPcreate(BAT *b) } /* Left over code */ - #if 0 /* This counts how many unicode codepoints the given string * contains. ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Merge with default
Changeset: cffe5ff7bdad for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/cffe5ff7bdad Modified Files: gdk/gdk.h gdk/gdk_private.h Branch: string_imprints Log Message: Merge with default diffs (truncated from 125754 to 300 lines): diff --git a/clients/Tests/MAL-signatures.stable.out b/clients/Tests/MAL-signatures.stable.out --- a/clients/Tests/MAL-signatures.stable.out +++ b/clients/Tests/MAL-signatures.stable.out @@ -5752,7 +5752,7 @@ stdout of test 'MAL-signatures` in direc [ "batcalc", "timestamp","pattern batcalc.timestamp(X_0:bat[:timestamp], X_1:bat[:oid], X_2:int):bat[:timestamp] ", "timestamp_2time_timestamp;", "" ] [ "batcalc", "uuid", "command batcalc.uuid(X_0:bat[:str], X_1:bat[:oid]):bat[:uuid] ", "UUIDstr2uuid_bulk;", "" ] [ "batcalc", "uuid", "command batcalc.uuid(X_0:bat[:uuid], X_1:bat[:oid]):bat[:uuid] ", "UUIDuuid2uuid_bulk;", "" ] -[ "batcalc", "wkb", "command batcalc.wkb(X_0:bat[:wkb], X_1:int, X_2:int):bat[:wkb] ", "geom_2_geom_bat;", "" ] +[ "batcalc", "wkb", "command batcalc.wkb(X_0:bat[:wkb], X_1:bat[:oid], X_2:int, X_3:int):bat[:wkb] ", "geom_2_geom_bat;", "" ] [ "batcalc", "xml", "command batcalc.xml(X_0:bat[:str]):bat[:xml] ", "BATXMLstr2xml;", "" ] [ "batcalc", "xor", "pattern batcalc.xor(X_0:bat[:bit], X_1:bat[:bit]):bat[:bit] ", "CMDbatXOR;", "" ] [ "batcalc", "xor", "pattern batcalc.xor(X_0:bat[:bit], X_1:bat[:bit], X_2:bat[:oid], X_3:bat[:oid]):bat[:bit] ", "CMDbatXOR;", "" ] @@ -9208,7 +9208,7 @@ stdout of test 'MAL-signatures` in direc [ "sql", "analyze", "unsafe pattern sql.analyze(X_0:int, X_1:lng, X_2:str, X_3:str):void ", "sql_analyze;", "" ] [ "sql", "analyze", "unsafe pattern sql.analyze(X_0:int, X_1:lng, X_2:str, X_3:str, X_4:str):void ","sql_analyze;", "" ] [ "sql", "any", "pattern sql.any(X_0:bit, X_1:bit, X_2:bit):bit ", "SQLany_cmp;", "" ] -[ "sql", "append", "pattern sql.append(X_0:int, X_1:str, X_2:str, X_3:str, X_4:lng, X_5:any):int ","mvc_append_wrap;", "" ] +[ "sql", "append", "pattern sql.append(X_0:int, 
X_1:str, X_2:str, X_3:str, X_4:bat[:oid], X_5:any):int ", "mvc_append_wrap;", "" ] [ "sql", "argRecord","pattern sql.argRecord():str ", "SQLargRecord;","" ] [ "sql", "argRecord","pattern sql.argRecord(X_0:any...):str ", "SQLargRecord;","" ] [ "sql", "assert", "pattern sql.assert(X_0:bit, X_1:str):void ", "SQLassert;", "" ] @@ -9232,7 +9232,7 @@ stdout of test 'MAL-signatures` in direc [ "sql", "bind_idxbat", "pattern sql.bind_idxbat(X_0:int, X_1:str, X_2:str, X_3:str, X_4:int):bat[:any_1] ","mvc_bind_idxbat_wrap;","" ] [ "sql", "bind_idxbat", "pattern sql.bind_idxbat(X_0:int, X_1:str, X_2:str, X_3:str, X_4:int, X_5:int, X_6:int) (X_7:bat[:oid], X_8:bat[:any_1]) ", "mvc_bind_idxbat_wrap;","" ] [ "sql", "bind_idxbat", "pattern sql.bind_idxbat(X_0:int, X_1:str, X_2:str, X_3:str, X_4:int, X_5:int, X_6:int):bat[:any_1] ", "mvc_bind_idxbat_wrap;","" ] -[ "sql", "claim","unsafe pattern sql.claim(X_0:int, X_1:str, X_2:str, X_3:lng):lng ","mvc_claim_wrap;", "" ] +[ "sql", "claim","unsafe pattern sql.claim(X_0:int, X_1:str, X_2:str, X_3:lng):bat[:oid] ", "mvc_claim_wrap;", "" ] [ "sql", "clear_table", "unsafe pattern sql.clear_table(X_0:str, X_1:str):lng ","mvc_clear_table_wrap;","" ] [ "sql", "commit", "unsafe pattern sql.commit():void ", "SQLcommit;", "" ] [ "sql", "copy_from","unsafe pattern sql.copy_from(X_0:ptr, X_1:str, X_2:str, X_3:str, X_4:str, X_5:str, X_6:lng, X_7:lng, X_8:int, X_9:str, X_10:int, X_11:int):bat[:any]... 
", "mvc_import_table_wrap;", "" ] @@ -9244,6 +9244,7 @@ stdout of test 'MAL-signatures` in direc [ "sql", "corr", "pattern sql.corr(X_0:int, X_1:int, X_2:bit, X_3:bit, X_4:int, X_5:oid, X_6:oid):dbl ", "SQLcorr;", "" ] [ "sql", "corr", "pattern sql.corr(X_0:lng, X_1:lng, X_2:bit, X_3:bit, X_4:int, X_5:oid, X_6:oid):dbl ", "SQLcorr;", "" ] [ "sql", "corr", "pattern sql.corr(X_0:sht, X_1:sht, X_2:bit, X_3:bit, X_4:int, X_5:oid, X_6:oid):dbl ", "SQLcorr;", "" ] +[ "sql", "count","pattern sql.count(X_0:str, X_1:str):lng ", "SQLbasecount;","" ] [ "sql", "count","pattern sql.count(X_0:any_1, X_1:bit, X_2:bit, X_3:bit, X_4:int, X_5:oid, X_6:oid):lng ", "SQLcount;","" ] [ "sql", "covariance", "pattern sql.covariance(X_0:bte, X_1:bte, X_2:bit, X_3:bit, X_4:int, X_5:oid, X_6:oid):dbl ", "SQLcovar_samp;", "" ] [ "sql", "covariance",
MonetDB: string_imprints - Initial implementation of the strimp filter
Changeset: 4ad4318de13e for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/4ad4318de13e Added Files: sql/scripts/90_strimps.sql Modified Files: gdk/gdk_strimps.c monetdb5/modules/mal/batExtensions.c sql/backends/monet5/CMakeLists.txt sql/scripts/CMakeLists.txt Branch: string_imprints Log Message: Initial implementation of the strimp filter diffs (145 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -455,6 +455,7 @@ STRMPfilter(BAT *b, char *q) BUN i; uint64_t qbmask; uint64_t *ptr; + int zz = 0; if (b->tstrimps == NULL) @@ -471,15 +472,19 @@ STRMPfilter(BAT *b, char *q) qbmask = STRMPmakebitstring(q, b->tstrimps); ptr = (uint64_t *)b->tstrimps->strimps_base; - for (i = 0; i < b->batCount; i++) { - if ((*ptr & qbmask) == qbmask) { + if ((*(ptr + i) & qbmask) == qbmask) { oid pos = i; if (BUNappend(r, , false) != GDK_SUCCEED) goto sfilter_fail; } + else { + zz++; + } } + printf("filtered out: %d entries\n", zz); + r->tkey = true; return virtualize(r); diff --git a/monetdb5/modules/mal/batExtensions.c b/monetdb5/modules/mal/batExtensions.c --- a/monetdb5/modules/mal/batExtensions.c +++ b/monetdb5/modules/mal/batExtensions.c @@ -340,7 +340,7 @@ PATstrimp_makehist(Client cntxt, MalBlkP } #endif static str -PATstrimp(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) +PATstrimpCreate(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) { bat bid; BAT *b; @@ -358,6 +358,50 @@ PATstrimp(Client cntxt, MalBlkPtr mb, Ma return MAL_SUCCEED; } +static str +PATstrimpFilter(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) { + (void)cntxt; + (void)mb; + (void)stk; + (void)pci; + throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) "UNIMPLEMENTED"); +} + +static str +PATstrimpFilterSelect(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) +{ + bat bid, sid; + BAT *b, *s, *ob; + str pat; + + (void)cntxt; + (void)mb; + + bid = *getArgReference_bat(stk, pci, 1); + if ((b = BATdescriptor(bid)) == NULL) 
+ throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); + + sid = *getArgReference_bat(stk, pci, 2); + if ((s = BATdescriptor(sid)) == NULL) + throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); + + assert(s->ttype == TYPE_void); + + if (!STRMPcreate(b)) { + throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) OPERATION_FAILED); + } + + pat = *getArgReference_str(stk, pci, 3); + if ((ob = STRMPfilter(b, pat)) == NULL) { + BBPunfix(b->batCacheid); + throw(MAL, "bat.strimpfilter", SQLSTATE(HY002)); + } + + *getArgReference_bat(stk, pci, 0) = ob->batCacheid; + BBPkeepref(ob->batCacheid); + + return MAL_SUCCEED; +} #include "mel.h" mel_func batExtensions_init_funcs[] = { @@ -392,7 +436,10 @@ mel_func batExtensions_init_funcs[] = { /* String imprints */ // pattern("bat", "strimpNDigrams", PATstrimp_ndigrams, false, "count digrams in a string bat", args(1,2,arg("",lng),batarg("b",str))), // pattern("bat", "strimpHistogram", PATstrimp_makehist, false, "make a histogram of all the byte pairs in a BAT", args(2,3,arg("",lng), batarg("",lng),batarg("b",str))), - pattern("bat", "strimp", PATstrimp, false, "construct the strimp a BAT", args(1,2,arg("",void),batarg("b",str))), + pattern("bat", "mkstrimp", PATstrimpCreate, false, "construct the strimp a BAT", args(1,2,arg("",void),batarg("b",str))), + pattern("bat", "strimpfilter", PATstrimpFilter, false, "", args(1,3,arg("",bit),arg("b",str),arg("q",str))), + pattern("bat", "strimpfilterselect", PATstrimpFilterSelect, false, "", args(1,5,batarg("",oid),batarg("b",str),batarg("s",oid),arg("q",str),arg("a",bit))), + pattern("bat", "strimpfilterjoin", PATstrimpFilter, false, "", args(2,8,batarg("",oid),batarg("b",str),arg("q",str))), { .imp=NULL } }; #include "mal_import.h" diff --git a/sql/backends/monet5/CMakeLists.txt b/sql/backends/monet5/CMakeLists.txt --- a/sql/backends/monet5/CMakeLists.txt +++ b/sql/backends/monet5/CMakeLists.txt @@ -40,7 +40,8 @@ set(include_sql_files 75_storagemodel 76_dump 
80_statistics - 81_tracer) + 81_tracer + 90_strimps) if(HAVE_HGE) list(APPEND include_sql_files diff --git a/sql/scripts/90_strimps.sql b/sql/scripts/90_strimps.sql new file mode 100644 --- /dev/null +++ b/sql/scripts/90_strimps.sql @@ -0,0 +1,8 @@ +create schema strimps; + +-- create procedure strimps.strmpcreate(b string) +-- external name bat.strimpCreate; +-- grant execute on procedure
MonetDB: string_imprints - Initial sql interface
Changeset: 0713d2b9a640 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/0713d2b9a640 Added Files: sql/backends/monet5/sql_strimps.c sql/backends/monet5/sql_strimps.h Modified Files: sql/backends/monet5/CMakeLists.txt sql/backends/monet5/sql.c sql/scripts/90_strimps.sql Branch: string_imprints Log Message: Initial sql interface To use, first construct the strimp by specifying the full column name: sql>call createstrimps('schema', 'table', 'column'); given that you can filter the column using filter syntax: sql>select column from table where [column] strimps.filter ['query string']; diffs (147 lines): diff --git a/sql/backends/monet5/CMakeLists.txt b/sql/backends/monet5/CMakeLists.txt --- a/sql/backends/monet5/CMakeLists.txt +++ b/sql/backends/monet5/CMakeLists.txt @@ -78,6 +78,7 @@ target_sources(sql sql_round_impl.h sql_fround.c sql_fround_impl.h sql_orderidx.c sql_orderidx.h + sql_strimps.c sql_strimps.h sql_time.c sql_bincopyfrom.c wlr.c wlr.h diff --git a/sql/backends/monet5/sql.c b/sql/backends/monet5/sql.c --- a/sql/backends/monet5/sql.c +++ b/sql/backends/monet5/sql.c @@ -4929,6 +4929,7 @@ finalize: #include "sql_assert.h" #include "sql_execute.h" #include "sql_orderidx.h" +#include "sql_strimps.h" #include "sql_subquery.h" #include "sql_statistics.h" #include "sql_transaction.h" @@ -5048,6 +5049,7 @@ static mel_func sql_init_funcs[] = { pattern("sql", "storage", sql_storage, false, "return a table with storage information for a particular column", args(17,20, batarg("schema",str),batarg("table",str),batarg("column",str),batarg("type",str),batarg("mode",str),batarg("location",str),batarg("count",lng),batarg("atomwidth",int),batarg("columnsize",lng),batarg("heap",lng),batarg("hashes",lng),batarg("phash",bit),batarg("imprints",lng),batarg("sorted",bit),batarg("revsorted",bit),batarg("key",bit),batarg("orderidx",lng),arg("sname",str),arg("tname",str),arg("cname",str))), pattern("sql", "createorderindex", sql_createorderindex, true, "Instantiate the order 
index on a column", args(0,3, arg("sch",str),arg("tbl",str),arg("col",str))), pattern("sql", "droporderindex", sql_droporderindex, true, "Drop the order index on a column", args(0,3, arg("sch",str),arg("tbl",str),arg("col",str))), + pattern("sql", "createstrimps", sql_createstrimps, true, "Instantiate the strimps index on a column", args(0,3, arg("sch",str),arg("tbl",str),arg("col",str))), command("calc", "identity", SQLidentity, false, "Returns a unique row identitfier.", args(1,2, arg("",oid),argany("",0))), command("batcalc", "identity", BATSQLidentity, false, "Returns the unique row identitfiers.", args(1,2, batarg("",oid),batargany("b",0))), pattern("batcalc", "identity", PBATSQLidentity, false, "Returns the unique row identitfiers.", args(2,4, batarg("resb",oid),arg("ns",oid),batargany("b",0),arg("s",oid))), diff --git a/sql/backends/monet5/sql_strimps.c b/sql/backends/monet5/sql_strimps.c new file mode 100644 --- /dev/null +++ b/sql/backends/monet5/sql_strimps.c @@ -0,0 +1,73 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V. 
+ */ + +#include "monetdb_config.h" +#include "mal_backend.h" +#include "sql_strimps.h" + +static str +sql_load_bat(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci, BAT **b) +{ + mvc *m = NULL; + str msg = getSQLContext(cntxt, mb, , NULL); + str sch,tbl,col; + sql_schema *s; + sql_table *t; + sql_column *c; + + if (msg != MAL_SUCCEED || (msg = checkSQLContext(cntxt)) != NULL) + return msg; + + sch = *getArgReference_str(stk, pci, 1); + tbl = *getArgReference_str(stk, pci, 2); + col = *getArgReference_str(stk, pci, 3); + + if (!(s = mvc_bind_schema(m, sch))) + throw(SQL, "sql.createstrimps", SQLSTATE(3FOOO) "Unknown schema %s", sch); + + if (!mvc_schema_privs(m, s)) + throw(SQL, "sql.createstrimps", SQLSTATE(42000) "Access denied for %s to schema '%s'", + get_string_global_var(m, "current_user"), s->base.name); + if (!(t = mvc_bind_table(m, s, tbl)) || !isTable(t)) + throw(SQL, "sql.createstrimps", SQLSTATE(42S02) "Unknown table %s.%s", sch, tbl); + if (!(c = mvc_bind_column(m, t, col))) + throw(SQL, "sql.createstrimps", SQLSTATE(38000) "Unknown column %s.%s.%s", sch, tbl, col); + + sqlstore *store = m->session->tr->store; + *b = store->storage_api.bind_col(m->session->tr, c, 0); + if (*b == 0) + throw(SQL, "sql.createstrimps", SQLSTATE(HY005) "Cannot access column %s", col); + + return msg; + +} + +str +sql_createstrimps(Client cntxt, MalBlkPtr mb, MalStkPtr stk,
MonetDB: string_imprints - Extract a separate MAL module for strimps
Changeset: 1bb96864a107 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/1bb96864a107 Added Files: monetdb5/modules/mal/strimps.c Modified Files: monetdb5/modules/mal/CMakeLists.txt monetdb5/modules/mal/batExtensions.c Branch: string_imprints Log Message: Extract a separate MAL module for strimps diffs (261 lines): diff --git a/monetdb5/modules/mal/CMakeLists.txt b/monetdb5/modules/mal/CMakeLists.txt --- a/monetdb5/modules/mal/CMakeLists.txt +++ b/monetdb5/modules/mal/CMakeLists.txt @@ -43,6 +43,7 @@ target_sources(malmodules projectionpath.c tablet.c tablet.h batcalc.c calc.c + strimps.c PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/wlc.h) diff --git a/monetdb5/modules/mal/batExtensions.c b/monetdb5/modules/mal/batExtensions.c --- a/monetdb5/modules/mal/batExtensions.c +++ b/monetdb5/modules/mal/batExtensions.c @@ -339,69 +339,6 @@ PATstrimp_makehist(Client cntxt, MalBlkP return MAL_SUCCEED; } #endif -static str -PATstrimpCreate(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) -{ - bat bid; - BAT *b; - (void)cntxt; - (void)mb; - - bid = *getArgReference_bat(stk, pci, 1); - if ((b = BATdescriptor(bid)) == NULL) - throw(MAL, "bat.strimpHeader", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); - - if(STRMPcreate(b) != GDK_SUCCEED) - throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) OPERATION_FAILED); - - // *getArgReference_lng(stk, pci, 0) = 0; - return MAL_SUCCEED; -} - -static str -PATstrimpFilter(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) { - (void)cntxt; - (void)mb; - (void)stk; - (void)pci; - throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) "UNIMPLEMENTED"); -} - -static str -PATstrimpFilterSelect(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) -{ - bat bid, sid; - BAT *b, *s, *ob; - str pat; - - (void)cntxt; - (void)mb; - - bid = *getArgReference_bat(stk, pci, 1); - if ((b = BATdescriptor(bid)) == NULL) - throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); - - sid = *getArgReference_bat(stk, pci, 2); - if ((s = 
BATdescriptor(sid)) == NULL) - throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); - - assert(s->ttype == TYPE_void); - - if (!STRMPcreate(b)) { - throw(MAL, "bat.strimpfilter", SQLSTATE(HY002) OPERATION_FAILED); - } - - pat = *getArgReference_str(stk, pci, 3); - if ((ob = STRMPfilter(b, pat)) == NULL) { - BBPunfix(b->batCacheid); - throw(MAL, "bat.strimpfilter", SQLSTATE(HY002)); - } - - *getArgReference_bat(stk, pci, 0) = ob->batCacheid; - BBPkeepref(ob->batCacheid); - - return MAL_SUCCEED; -} #include "mel.h" mel_func batExtensions_init_funcs[] = { @@ -432,14 +369,6 @@ mel_func batExtensions_init_funcs[] = { #endif pattern("bat", "appendBulk", CMDBATappend_bulk, false, "append the arguments ins to i", args(1,4, batargany("",1), batargany("i",1),arg("force",bit),varargany("ins",1))), pattern("bat", "appendBulk", CMDBATappend_bulk, false, "append the arguments ins to i", args(1,4, batargany("",1), batargany("i",1),arg("force",bit),batvarargany("ins",1))), - - /* String imprints */ - // pattern("bat", "strimpNDigrams", PATstrimp_ndigrams, false, "count digrams in a string bat", args(1,2,arg("",lng),batarg("b",str))), - // pattern("bat", "strimpHistogram", PATstrimp_makehist, false, "make a histogram of all the byte pairs in a BAT", args(2,3,arg("",lng), batarg("",lng),batarg("b",str))), - pattern("bat", "mkstrimp", PATstrimpCreate, false, "construct the strimp a BAT", args(1,2,arg("",void),batarg("b",str))), - pattern("bat", "strimpfilter", PATstrimpFilter, false, "", args(1,3,arg("",bit),arg("b",str),arg("q",str))), - pattern("bat", "strimpfilterselect", PATstrimpFilterSelect, false, "", args(1,5,batarg("",oid),batarg("b",str),batarg("s",oid),arg("q",str),arg("a",bit))), - pattern("bat", "strimpfilterjoin", PATstrimpFilter, false, "", args(2,8,batarg("",oid),batarg("b",str),arg("q",str))), { .imp=NULL } }; #include "mal_import.h" diff --git a/monetdb5/modules/mal/strimps.c b/monetdb5/modules/mal/strimps.c new file mode 100644 --- /dev/null +++ 
b/monetdb5/modules/mal/strimps.c @@ -0,0 +1,157 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V. + */ +#include "monetdb_config.h" +#include "mal_client.h" +#include "mal_interpreter.h" +#include "mal_exception.h" + +#if 0 +/* + * String imprints. + */ +static str +PATstrimp_ndigrams(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) +{ + bat bid; + BAT *b; + size_t n; + +
MonetDB: string_imprints - Attempt to make filtering mitosis-aware
Changeset: 4b4623152417 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/4b4623152417 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Attempt to make filtering mitosis-aware This needs still more work. diffs (59 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -455,18 +455,17 @@ STRMPfilter(BAT *b, char *q) BUN i; uint64_t qbmask; uint64_t *ptr; - int zz = 0; - if (b->tstrimps == NULL) goto sfilter_fail; - r = COLnew(0, TYPE_oid, b->batCount, TRANSIENT); + r = COLnew(b->hseqbase, TYPE_oid, b->batCount, TRANSIENT); if (r == NULL) { goto sfilter_fail; } if (!BATcheckstrimps(b)) { + BBPunfix(r->batCacheid); goto sfilter_fail; } qbmask = STRMPmakebitstring(q, b->tstrimps); @@ -478,18 +477,13 @@ STRMPfilter(BAT *b, char *q) if (BUNappend(r, , false) != GDK_SUCCEED) goto sfilter_fail; } - else { - zz++; - } } - printf("filtered out: %d entries\n", zz); r->tkey = true; return virtualize(r); sfilter_fail: - BBPunfix(r->batCacheid); return NULL; } @@ -579,9 +573,15 @@ STRMPcreate(BAT *b) assert(b->ttype == TYPE_str); TRC_DEBUG_IF(ALGO) t0 = GDKusec(); + if (BATcheckstrimps(b)) return GDK_SUCCEED; + if (VIEWtparent(b)) { + assert(b->tstrimps == NULL); + b = BBPdescriptor(VIEWtparent(b)); + } + if ((h = STRMPcreateStrimpHeap(b)) == NULL) { return GDK_FAIL; } ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Merge with default
Changeset: 0f94f85f07bf for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/0f94f85f07bf Branch: string_imprints Log Message: Merge with default diffs (truncated from 889 to 300 lines): diff --git a/documentation/index.rst b/documentation/index.rst --- a/documentation/index.rst +++ b/documentation/index.rst @@ -16,6 +16,9 @@ Welcome to MonetDB's documentation! monetdbe/examples monetdbe/installation monetdbe/monetdbe_api + monetdbe/manual_pages/monetdbe_open + monetdbe/manual_pages/monetdbe_options + monetdbe/manual_pages/monetdbe_remote source/intro source/build source/build-fedora diff --git a/gdk/gdk_cand.h b/gdk/gdk_cand.h --- a/gdk/gdk_cand.h +++ b/gdk/gdk_cand.h @@ -177,6 +177,7 @@ gdk_export oid canditer_last(const struc gdk_export oid canditer_prev(struct canditer *ci); gdk_export oid canditer_peekprev(struct canditer *ci); gdk_export oid canditer_idx(const struct canditer *ci, BUN p); +#define canditer_idx_dense(ci, p) ((p >= (ci)->ncand)?oid_nil:((ci)->seq + p)) gdk_export void canditer_setidx(struct canditer *ci, BUN p); gdk_export void canditer_reset(struct canditer *ci); gdk_export BUN canditer_search(const struct canditer *ci, oid o, bool next); diff --git a/gdk/gdk_project.c b/gdk/gdk_project.c --- a/gdk/gdk_project.c +++ b/gdk/gdk_project.c @@ -20,6 +20,57 @@ * hseqbase + its batCount. 
*/ +#define project1_loop(TYPE)\ +static gdk_return \ +project1_##TYPE(BAT *restrict bn, BAT *restrict l, BAT *restrict r1) \ +{ \ + BUN lo, hi; \ + const TYPE *restrict r1t; \ + TYPE *restrict bt; \ + oid r1seq, r1end; \ + \ + MT_thread_setalgorithm(__func__); \ + r1t = (const TYPE *) Tloc(r1, 0); \ + bt = (TYPE *) Tloc(bn, 0); \ + r1seq = r1->hseqbase; \ + r1end = r1seq + BATcount(r1); \ + if (BATtdense(l)) { \ + if (l->tseqbase < r1seq || \ + (l->tseqbase+BATcount(l)) >= r1end) {\ + GDKerror("does not match always\n");\ + return GDK_FAIL;\ + } \ + oid off = l->tseqbase - r1seq; \ + r1t += off; \ + for (lo = 0, hi = BATcount(l); lo < hi; lo++) \ + bt[lo] = r1t[lo]; \ + } else {\ + const oid *restrict ot = (const oid *) Tloc(l, 0); \ + for (lo = 0, hi = BATcount(l); lo < hi; lo++) { \ + oid o = ot[lo]; \ + if (o < r1seq || o >= r1end) { \ + GDKerror("does not match always\n");\ + return GDK_FAIL;\ + } \ + bt[lo] = r1t[o - r1seq];\ + } \ + } \ + BATsetcount(bn, lo);\ + return GDK_SUCCEED; \ +} + +/* project type switch */ +project1_loop(bte) +project1_loop(sht) +project1_loop(int) +project1_loop(flt) +project1_loop(dbl) +project1_loop(lng) +#ifdef HAVE_HGE +project1_loop(hge) +#endif +project1_loop(uuid) + #define project_loop(TYPE) \ static gdk_return \ project_##TYPE(BAT *restrict bn, BAT *restrict l, \ @@ -34,6 +85,8 @@ project_##TYPE(BAT *restrict bn, BAT *re oid r1seq, r1end; \ oid r2seq, r2end; \ \ + if ((!ci || ci->tpe == cand_dense) && l->tnonil && !r2) \ + return project1_##TYPE(bn, l, r1); \
MonetDB: string_imprints - Make naming more consistent
Changeset: 5493fe034571 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/5493fe034571 Modified Files: gdk/gdk_strimps.c gdk/gdk_strimps.h monetdb5/modules/mal/batExtensions.c Branch: string_imprints Log Message: Make naming more consistent diffs (131 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -65,7 +65,7 @@ * contains. */ static size_t -GDKstrimp_strlen(const uint8_t *s) +STRMP_strlen(const uint8_t *s) { size_t ret = 0; size_t i; @@ -102,7 +102,7 @@ GDKstrimp_strlen(const uint8_t *s) * 1 digram starting at character n - 1 */ gdk_return -GDKstrimp_ndigrams(BAT *b, size_t *n) +STRMPndigrams(BAT *b, size_t *n) { // lng t0; BUN i; @@ -119,7 +119,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) *n = 0; for (i = 0; i < b->batCount; i++) { s = (char *)BUNtail(bi, i); -// *n += GDKstrimp_strlen(s) - 1; +// *n += STRMP_strlen(s) - 1; *n += strlen(s) - 1; // TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, s); } @@ -152,7 +152,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) * count. */ gdk_return -GDKstrimp_make_histogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins) +STRMPmakehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins) { lng t0=0; size_t hi; @@ -269,7 +269,7 @@ create_header(BAT *b) if ((header = (StrimpHeader*)GDKmalloc(sizeof(StrimpHeader))) == NULL) return NULL; - if(GDKstrimp_make_histogram(b, hist, STRIMP_HISTSIZE, ) != GDK_SUCCEED) { + if(STRMPmakehistogram(b, hist, STRIMP_HISTSIZE, ) != GDK_SUCCEED) { GDKfree(header); return NULL; } @@ -307,7 +307,7 @@ lookup_index(StrimpHeader *h, DataPair n * This should probably be inlined. 
*/ static uint64_t -GDKstrimp_make_bitstring(const str s, StrimpHeader *h) +STRMPmakebitstring(const str s, StrimpHeader *h) { uint64_t ret = 0; int8_t pair_idx; @@ -379,7 +379,7 @@ create_strimp_heap(BAT *b, StrimpHeader /* Create */ gdk_return -GDKstrimp_create_strimp(BAT *b) +STRMPcreate(BAT *b) { lng t0 = 0; BATiter bi; @@ -401,13 +401,13 @@ GDKstrimp_create_strimp(BAT *b) GDKfree(head); return GDK_FAIL; } - dh = (uint64_t *)h->base + h->free; + dh = (uint64_t *)h->base + h->free; // That's probably not correct bi = bat_iterator(b); for (i = 0; i < b->batCount; i++) { s = (str)BUNtvar(bi, i); if (!strNil(s)) - *dh++ = GDKstrimp_make_bitstring(s, head); + *dh++ = STRMPmakebitstring(s, head); else *dh++ = 0; /* no pairs in nil values */ diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h --- a/gdk/gdk_strimps.h +++ b/gdk/gdk_strimps.h @@ -27,9 +27,10 @@ typedef struct { DataPair bytepairs[STRIMP_HEADER_SIZE]; } StrimpHeader; -gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n); // Remove? -gdk_export gdk_return GDKstrimp_make_histogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins); // make static -// gdk_export gdk_return GDKstrimp_make_header(StrimpHeader *h, uint64_t *hist, size_t hist_size); // make static -//gdk_export gdk_return GDKstrimp_make_header(BAT *b); -gdk_export gdk_return GDKstrimp_create_strimp(BAT *b); +gdk_export gdk_return STRMPndigrams(BAT *b, size_t *n); // Remove? 
+gdk_export gdk_return STRMPmakehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins); // make static +// gdk_export gdk_return STRMP_make_header(StrimpHeader *h, uint64_t *hist, size_t hist_size); // make static +//gdk_export gdk_return STRMP_make_header(BAT *b); +gdk_export gdk_return STRMPcreate(BAT *b); +gdk_export BAT *STRMPfilter(BAT *b, char *q); #endif /* _GDK_STRIMPS_H_ */ diff --git a/monetdb5/modules/mal/batExtensions.c b/monetdb5/modules/mal/batExtensions.c --- a/monetdb5/modules/mal/batExtensions.c +++ b/monetdb5/modules/mal/batExtensions.c @@ -284,7 +284,7 @@ PATstrimp_ndigrams(Client cntxt, MalBlkP if ((b = BATdescriptor(bid)) == NULL) throw(MAL, "bat.strimpDigrams", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); - if (!GDKstrimp_ndigrams(b, )) { + if (!STRMPndigrams(b, )) { throw(MAL, "bat.strimpDigrams", SQLSTATE(HY002) OPERATION_FAILED); } @@ -309,7 +309,7 @@ PATstrimp_makehist(Client cntxt, MalBlkP if ((b = BATdescriptor(bid)) == NULL) throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); - if (!GDKstrimp_make_histogram(b, hist, STRIMP_HISTSIZE, )) { + if (!STRMPmakehistogram(b, hist, STRIMP_HISTSIZE, )) {
MonetDB: string_imprints - Read persistent strimp [WIP]
Changeset: fa263cc6a470 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/fa263cc6a470 Modified Files: gdk/gdk.h gdk/gdk_private.h gdk/gdk_strimps.c gdk/gdk_strimps.h monetdb5/modules/mal/batExtensions.c Branch: string_imprints Log Message: Read persistent strimp [WIP] diffs (truncated from 361 to 300 lines): diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -568,6 +568,7 @@ typedef struct { typedef struct Hash Hash; typedef struct Imprints Imprints; +typedef struct Strimps Strimps; /* * @+ Binary Association Tables @@ -732,7 +733,7 @@ typedef struct { Hash *hash; /* hash table */ Imprints *imprints; /* column imprints index */ Heap *orderidx; /* order oid index */ - Heap *strimps; /* string imprint index */ + Strimps *strimps; /* string imprint index */ PROPrec *props; /* list of dynamic properties stored in the bat descriptor */ } COLrec; diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h --- a/gdk/gdk_private.h +++ b/gdk/gdk_private.h @@ -18,6 +18,9 @@ /* persist order index heaps for persistent BATs */ #define PERSISTENTIDX 1 +/* persist strimp heaps for persistent BATs */ +#define PERSISTENTSTRIMP 1 + #include "gdk_system_private.h" enum heaptype { @@ -384,6 +387,15 @@ struct Imprints { BUN dictcnt;/* counter for cache dictionary */ }; +struct Strimps { + Heap strimps; + void *offsets_base; /* pointer into strimps heap (pair offsets) */ + /* offsets_base is a pointer to either a uint8_t or a uint16_ */ + uint8_t *pairs_base;/* pointer into strimps heap (pairs start) */ + void *strimps_base; /* pointer into strimps heap (strimps start) */ + /* strimps_base is a pointer to either a uint32_t or a uint64_t */ +}; + typedef struct { MT_Lock swap; } batlock_t; diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -91,7 +91,6 @@ STRMP_strlen(const uint8_t *s) return ret; } -#endif /* Given a BAT return the number of digrams in it. 
The observation is * that the number of digrams is the number of characters - 1: @@ -129,6 +128,7 @@ STRMPndigrams(BAT *b, size_t *n) return GDK_SUCCEED; } +#endif /* The isIgnored is a bit suspect in terms of unicode. There are * non-ASCII codepoints that are considered spaces, for example the @@ -151,8 +151,8 @@ STRMPndigrams(BAT *b, size_t *n) * Return the histogram in hist and the number of non-zero bins in * count. */ -gdk_return -STRMPmakehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins) +static gdk_return +STRMPmakehistogramBP(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins) { lng t0=0; size_t hi; @@ -269,7 +269,7 @@ create_header(BAT *b) if ((header = (StrimpHeader*)GDKmalloc(sizeof(StrimpHeader))) == NULL) return NULL; - if(STRMPmakehistogram(b, hist, STRIMP_HISTSIZE, &nbins) != GDK_SUCCEED) { + if(STRMPmakehistogramBP(b, hist, STRIMP_HISTSIZE, &nbins) != GDK_SUCCEED) { GDKfree(header); return NULL; } @@ -324,11 +324,11 @@ STRMPmakebitstring(const str s, StrimpHe } /* Create the heap for a string imprint. Returns NULL on failure. */ -static Heap * -create_strimp_heap(BAT *b, StrimpHeader *h) +static Strimps * +create_strimp(BAT *b, StrimpHeader *h) { - Heap *r = NULL; uint64_t *d; + Strimps *r = NULL; uint64_t descriptor; uint64_t npairs, bytes_per_pair, hsize; size_t i; @@ -336,15 +336,15 @@ create_strimp_heap(BAT *b, StrimpHeader const char *nme; nme = GDKinmemory(b->theap->farmid) ? 
":memory:" : BBP_physical(b->batCacheid); - if ((r = GDKzalloc(sizeof(Heap))) == NULL || - (r->farmid = BBPselectfarm(b->batRole, b->ttype, strimpheap)) < 0 || - strconcat_len(r->filename, sizeof(r->filename), - nme, ".strimp", NULL) >= sizeof(r->filename) || - HEAPalloc(r, BATcount(b) + STRIMP_OFFSET, sizeof(uint64_t), 0) != GDK_SUCCEED) { + if ((r = GDKzalloc(sizeof(Strimps))) == NULL || + (r->strimps.farmid = BBPselectfarm(b->batRole, b->ttype, strimpheap)) < 0 || + strconcat_len(r->strimps.filename, sizeof(r->strimps.filename), + nme, ".strimp", NULL) >= sizeof(r->strimps.filename) || + HEAPalloc(&r->strimps, BATcount(b) + STRIMP_OFFSET, sizeof(uint64_t), 0) != GDK_SUCCEED) { GDKfree(r); return NULL; } - r->free = STRIMP_OFFSET * sizeof(uint64_t); + r->strimps.free = STRIMP_OFFSET * sizeof(uint64_t); npairs = STRIMP_HEADER_SIZE; bytes_per_pair = 2; /* Bytepair implementation */ @@ -353,9 +353,9 @@ create_strimp_heap(BAT *b, StrimpHeader
MonetDB: string_imprints - Update comment
Changeset: 0cc344ae7097 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/0cc344ae7097 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Update comment diffs (45 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -16,17 +16,33 @@ * - a 64 bit mask for each item in the BAT that encodes the presence or * absence of each element of the header in the specific item. * - * A string imprint is stored in a new Heap in the BAT. + * A string imprint is stored in a new Heap in the BAT, aligned in 8 + * byte (64 bit) words. * - * In the current (byte pair) implementation the first 136 bytes - * (i.e. the first 17 64 bit quantities) in the Heap are as follows: + * The first 64 bit word describes how the header of the strimp is + * encoded. The most significant byte (v in the schematic below) is the + * version number. The second (np) is the number of pairs in the + * header. The third (b/p) is the number of bytes per pair if each pair + * is encoded using a constant number of bytes or 0 if it is utf-8. The + * next 2 bytes (hs) is the size of the header in bytes. The last 3 + * bytes needed to align to the 8 byte boundary should be zero, and are + * reserved for future use. + * + * In the current implementation we use 64 byte pairs for the header, so * - * | Version Number | - - * | byte pair 01 | byte pair 02 | byte pair 03 | byte pair 04 | | - * | byte pair 05 | byte pair 06 | byte pair 07 | byte pair 08 | | 17 64 bit quantities - * [...] | - * | byte pair 61 | byte pair 62 | byte pair 63 | byte pair 64 | - + * np == 64 + * b/p == 2 + * hs == 128 + * + * The actual header follows. If it ends before an 8 byte boundary it + * is padded with zeros. * + * | v | np | b/p | hs | reserved | 8bytes + * | |--- + * Strimp Header | + * | | hs bytes + padding + * | | | + * | |--- * The bitmasks for each string in the BAT follow after this. 
* * Strimp creation goes as follows: ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Fix bitmask construction
Changeset: 14266938fcad for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/14266938fcad Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Fix bitmask construction The current bytepair in the string might not exist in the header. diffs (52 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -60,10 +60,10 @@ #include "gdk.h" #include "gdk_private.h" +#if 0 /* This counts how many unicode codepoints the given string * contains. */ -#if 0 static size_t GDKstrimp_strlen(const uint8_t *s) { @@ -251,7 +251,7 @@ make_header(StrimpHeader *h, uint64_t* h } for(i = 0; i < STRIMP_HEADER_SIZE; i++) { - TRC_DEBUG(ALGO, "%u %u: %lu", indexToPair1(h->bytepairs[i]), indexToPair2(h->bytepairs[i]), max_counts[i]); + TRC_DEBUG(ALGO, "0x%x 0x%x: %lu", indexToPair1(h->bytepairs[i]), indexToPair2(h->bytepairs[i]), max_counts[i]); } TRC_DEBUG(ALGO, LLFMT " usec\n", GDKusec() - t0); @@ -283,7 +283,7 @@ create_header(BAT *b) * * h[i] == p * - * Returns 0 if p is not in h. + * Returns -1 if p is not in h. * * TODO: Should this be inlined somehow? (probably yes) */ @@ -295,7 +295,7 @@ lookup_index(StrimpHeader *h, DataPair n if(h->bytepairs[i] == n) return i; - return 0; + return -1; } @@ -314,7 +314,8 @@ GDKstrimp_make_bitstring(const str s, St for(it = s; *it != 0 && *(it+1) != 0; it++) { pair_idx = lookup_index(h, pairToIndex(*it, *(it+1))); - ret |= 0x1 << pair_idx; + if (pair_idx >= 0) + ret |= 0x1 << pair_idx; } return ret; ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Construct strimp descriptor correctly
Changeset: f46a719af133 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/f46a719af133 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Construct strimp descriptor correctly diffs (92 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -20,7 +20,7 @@ * byte (64 bit) words. * * The first 64 bit word describes how the header of the strimp is - * encoded. The most significant byte (v in the schematic below) is the + * encoded. The least significant byte (v in the schematic below) is the * version number. The second (np) is the number of pairs in the * header. The third (b/p) is the number of bytes per pair if each pair * is encoded using a constant number of bytes or 0 if it is utf-8. The @@ -205,22 +205,23 @@ GDKstrimp_make_histogram(BAT *b, uint64_ return GDK_SUCCEED; } -/* Given a histogram find the indices of the 64 largest counts. +/* Given a histogram find the indices of the STRIMP_HEADER_SIZE largest + * counts. * * We make one scan of histogram and every time we find a count that is - * greater than the current minimum of the 64, we bubble it up in the - * header until we find a count that is greater. We carry the index in - * the histogram because this is the information we are actually - * interested in keeping. + * greater than the current minimum of the STRIMP_HEADER_SIZE, we bubble + * it up in the header until we find a count that is greater. We carry + * the index in the histogram because this is the information we are + * actually interested in keeping. * - * At the end of this process we have the indices of 64 largest counts - * in the histogram. This process is O(n) in time since we are doing - * constant work (at most 63 comparisons and swaps) for each item in the - * histogram and as such is (theoretically) more efficient than sorting - * (O(nlog n))and taking the 64 largest elements. This depends on the - * size of the histogram n. 
For some small n sorting might be more - * efficient, but for such inputs the difference should not be - * noticeable. + * At the end of this process we have the indices of STRIMP_HEADER_SIZE + * largest counts in the histogram. This process is O(n) in time since + * we are doing constant work (at most 63 comparisons and swaps) for + * each item in the histogram and as such is (theoretically) more + * efficient than sorting (O(nlog n))and taking the STRIMP_HEADER_SIZE + * largest elements. This depends on the size of the histogram n. For + * some small n sorting might be more efficient, but for such inputs the + * difference should not be noticeable. * * In the current implementation each index is a DataPair value that is * constructed by pairToIndex from 2 consecutive bytes in the input. @@ -328,6 +329,9 @@ create_strimp_heap(BAT *b, StrimpHeader { Heap *r = NULL; uint64_t *d; + uint64_t descriptor; + uint8_t npairs, bytes_per_pair; + uint16_t hsize; size_t i,j; const char *nme; @@ -342,7 +346,17 @@ create_strimp_heap(BAT *b, StrimpHeader } r->free = STRIMP_OFFSET * sizeof(uint64_t); + npairs = STRIMP_HEADER_SIZE; + bytes_per_pair = 2; /* Bytepair implementation */ + hsize = sizeof(h->bytepairs); + + assert(bytes_per_pair == 0 || npairs*bytes_per_pair == hsize); + + descriptor = 0; + descriptor = STRIMP_VERSION | npairs << 8 | bytes_per_pair << 16 | hsize << 24; + d = (uint64_t *)r->base; + *d++ = descriptor; /* This loop assumes that we are working with byte pairs * (i.e. the type of the header is uint16_t). TODO: generalize. */ @@ -352,7 +366,14 @@ create_strimp_heap(BAT *b, StrimpHeader *d <<= 16; *d |= h->bytepairs[i + j]; } + d++; } +#ifndef NDEBUG + FILE *fp = fopen("/tmp/foo.strimp", "wb"); + fwrite(r->base, sizeof(uint64_t), STRIMP_HEADER_SIZE/4 + 1, fp); + fclose(fp); +#endif + return r; } ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Take into account negative numbers
Changeset: d74bcfb2b926 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/d74bcfb2b926 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Take into account negative numbers diffs (45 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -287,7 +287,7 @@ create_header(BAT *b) * * TODO: Should this be inlined somehow? (probably yes) */ -static uint8_t +static int8_t lookup_index(StrimpHeader *h, DataPair n) { size_t i; @@ -309,12 +309,13 @@ static uint64_t GDKstrimp_make_bitstring(const str s, StrimpHeader *h) { uint64_t ret = 0; - uint8_t pair_idx; + int8_t pair_idx; char *it; for(it = s; *it != 0 && *(it+1) != 0; it++) { pair_idx = lookup_index(h, pairToIndex(*it, *(it+1))); if (pair_idx >= 0) + assert(pair_idx < STRIMP_HEADER_SIZE); ret |= 0x1 << pair_idx; } @@ -323,7 +324,7 @@ GDKstrimp_make_bitstring(const str s, St /* Create the heap for a string imprint. Returns NULL on failure. */ static Heap * -createStrimpheap(BAT *b, StrimpHeader *h) +create_strimp_heap(BAT *b, StrimpHeader *h) { Heap *r = NULL; uint64_t *d; @@ -374,7 +375,7 @@ GDKstrimp_create_strimp(BAT *b) return GDK_FAIL; } - if ((h = createStrimpheap(b, head)) == NULL) { + if ((h = create_strimp_heap(b, head)) == NULL) { GDKfree(head); return GDK_FAIL; } ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Merge branch 'master' into branches/s...
Changeset: 3a4196c618de for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3a4196c618de Branch: string_imprints Log Message: Merge branch 'master' into branches/string_imprints diffs (truncated from 4430 to 300 lines): diff --git a/clients/Tests/MAL-signatures.stable.out b/clients/Tests/MAL-signatures.stable.out --- a/clients/Tests/MAL-signatures.stable.out +++ b/clients/Tests/MAL-signatures.stable.out @@ -777,7 +777,8 @@ stdout of test 'MAL-signatures` in direc [ "batalgebra","not_like", "pattern batalgebra.not_like(X_1:bat[:str], X_2:bat[:str], X_3:str):bat[:bit] ", "BATPCREnotlike;", "" ] [ "batalgebra","not_like", "pattern batalgebra.not_like(X_1:bat[:str], X_2:str, X_3:str):bat[:bit] ", "BATPCREnotlike;", "" ] [ "batalgebra","not_like", "pattern batalgebra.not_like(X_1:str, X_2:bat[:str], X_3:str):bat[:bit] ", "BATPCREnotlike;", "" ] -[ "batblob", "nitems", "command batblob.nitems(X_1:bat[:blob]):bat[:int] ","BLOBnitems_bulk;", "" ] +[ "batblob", "nitems", "pattern batblob.nitems(X_1:bat[:blob]):bat[:int] ","BLOBnitems_bulk;", "" ] +[ "batblob", "nitems", "pattern batblob.nitems(X_1:bat[:blob], X_2:bat[:oid]):bat[:int] ", "BLOBnitems_bulk;", "" ] [ "batcalc", "!=", "pattern batcalc.!=(X_1:any_1, X_2:bat[:any_1]):bat[:bit] ","CMDbatNE;","" ] [ "batcalc", "!=", "pattern batcalc.!=(X_1:any_1, X_2:bat[:any_1], X_3:bat[:oid]):bat[:bit] ", "CMDbatNE;","" ] [ "batcalc", "!=", "pattern batcalc.!=(X_1:any_1, X_2:bat[:any_1], X_3:bat[:oid], X_4:bit):bat[:bit] ","CMDbatNE;","" ] @@ -8010,7 +8011,6 @@ stdout of test 'MAL-signatures` in direc [ "calc", "iszero", "pattern calc.iszero(X_1:int):bit ", "CMDvarISZERO;","" ] [ "calc", "iszero", "pattern calc.iszero(X_1:lng):bit ", "CMDvarISZERO;","" ] [ "calc", "iszero", "pattern calc.iszero(X_1:sht):bit ", "CMDvarISZERO;","" ] -[ "calc", "json", "command calc.json(X_1:json):json ", "JSONstr2json;","" ] [ "calc", "json", "command calc.json(X_1:str):json ", "JSONstr2json;","" ] [ "calc", "length", "command 
calc.length(X_1:str):int ", "CMDstrlength;","" ] [ "calc", "lng", "pattern calc.lng(X_1:bit):lng ", "CMDvarCONVERT;", "" ] @@ -8814,10 +8814,7 @@ stdout of test 'MAL-signatures` in direc [ "json", "fold", "pattern json.fold(X_1:bat[:str], X_2:bat[:any]):json ","JSONfold;","" ] [ "json", "integer", "command json.integer(X_1:json):lng ", "JSONjson2integer;","" ] [ "json", "isarray", "command json.isarray(X_1:json):bit ", "JSONisarray;", "" ] -[ "json", "isarray", "command json.isarray(X_1:str):bit ", "JSONisarray;", "" ] [ "json", "isobject", "command json.isobject(X_1:json):bit ", "JSONisobject;","" ] -[ "json", "isobject", "command json.isobject(X_1:str):bit ", "JSONisobject;","" ] -[ "json", "isvalid", "command json.isvalid(X_1:json):bit ", "JSONisvalid;", "" ] [ "json", "isvalid", "command json.isvalid(X_1:str):bit ", "JSONisvalid;", "" ] [ "json", "keyarray", "command json.keyarray(X_1:json):json ", "JSONkeyArray;","" ] [ "json", "keys", "command json.keys(X_1:json):bat[:str] ", "JSONkeyTable;","" ] diff --git a/clients/Tests/MAL-signatures.stable.out.int128 b/clients/Tests/MAL-signatures.stable.out.int128 --- a/clients/Tests/MAL-signatures.stable.out.int128 +++ b/clients/Tests/MAL-signatures.stable.out.int128 @@ -892,7 +892,8 @@ stdout of test 'MAL-signatures` in direc [ "batalgebra","not_like", "pattern batalgebra.not_like(X_1:bat[:str], X_2:bat[:str], X_3:str):bat[:bit] ", "BATPCREnotlike;", "" ] [ "batalgebra","not_like", "pattern batalgebra.not_like(X_1:bat[:str], X_2:str, X_3:str):bat[:bit] ", "BATPCREnotlike;", "" ] [ "batalgebra","not_like", "pattern batalgebra.not_like(X_1:str, X_2:bat[:str], X_3:str):bat[:bit] ", "BATPCREnotlike;", "" ] -[ "batblob", "nitems", "command batblob.nitems(X_1:bat[:blob]):bat[:int] ","BLOBnitems_bulk;", "" ] +[ "batblob", "nitems", "pattern batblob.nitems(X_1:bat[:blob]):bat[:int] ","BLOBnitems_bulk;", "" ] +[ "batblob", "nitems", "pattern batblob.nitems(X_1:bat[:blob], X_2:bat[:oid]):bat[:int] ", "BLOBnitems_bulk;", "" ] [ 
"batcalc", "!=", "pattern batcalc.!=(X_1:any_1, X_2:bat[:any_1]):bat[:bit] ","CMDbatNE;","" ] [ "batcalc", "!=", "pattern batcalc.!=(X_1:any_1, X_2:bat[:any_1], X_3:bat[:oid]):bat[:bit] ",
MonetDB: string_imprints - Add documentation and move things arr...
Changeset: e695149dd3ce for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e695149dd3ce Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Add documentation and move things arround diffs (115 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -94,48 +94,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) ((TPE *) _a)[_j] = _t; \ } while(0) -static StrimpHeader * -make_header(StrimpHeader *h, uint64_t* hist, size_t hist_size) -{ - lng t0 = 0; - size_t i; - uint64_t max_counts[STRIMP_SIZE] = {0}; - const size_t cmin_max = STRIMP_SIZE - 1; - size_t hidx; - - TRC_DEBUG_IF(ALGO) t0 = GDKusec(); - - for(i = 0; i < STRIMP_SIZE; i++) - h->bytepairs[i] = 0; - - for(i = 0; i < hist_size; i++) { - if (max_counts[cmin_max] < hist[i]) { - max_counts[cmin_max] = hist[i]; - h->bytepairs[cmin_max] = i; -for(hidx = cmin_max; hidx > 0 && max_counts[hidx] > max_counts[hidx-1]; hidx--) { - swp(max_counts, hidx, hidx-1, uint64_t); - swp(h->bytepairs, hidx, hidx-1, uint16_t); - } - } - } - - for(i = 0; i < STRIMP_SIZE; i++) { - TRC_DEBUG(ALGO, "%u %u: %lu", indexToPair1(h->bytepairs[i]), indexToPair2(h->bytepairs[i]), max_counts[i]); - } - - TRC_DEBUG_ENDIF(ALGO, LLFMT "usec\n", GDKusec() - t0); - - return h; -} - - -/* static uint64_t */ -/* add_to_header(size_t idx, uint64_t count) */ -/* { */ -/* while */ -/* return GDK_SUCCEED; */ -/* } */ -/* Construct a histogram of pairs of bytes. +/* Construct a histogram of pairs of bytes in the input BAT. * * Return the histogram in hist and the number of non-zero bins in * count. @@ -194,8 +153,59 @@ GDKstrimp_make_histogram(BAT *b, uint64_ return GDK_SUCCEED; } -gdk_return -GDKstrimp_make_header(BAT *b) +/* Given a histogram find the indices of the 64 largest counts. 
+ * + * We make one scan of histogram and every time we find a count that is + * greater than the current minimum of the 64, we bubble it up in the + * header until we find a count that is greater. We carry the index in + * the histogram because this is the information we are actually + * interested in keeping. + * + * At the end of this process we have the indices of 64 largest counts + * in the histogram. This process is O(n) in time since we are doing + * constant work (at most 63 comparisons and swaps) for each item in the + * histogram and as such is (theoretically) more efficient than sorting + * (O(nlog n))and taking the 64 largest elements. This depends on the + * size of the histogram n. For some small n sorting might be more + * efficient, but for such inputs the difference should not be + * noticeable. + * + * In the current implementation each index is a DataPair value that is + * constructed by pairToIndex from 2 consecutive bytes in the input. + */ +static StrimpHeader * +make_header(StrimpHeader *h, uint64_t* hist, size_t hist_size) +{ + lng t0 = 0; + size_t i; + uint64_t max_counts[STRIMP_HEADER_SIZE] = {0}; + const size_t cmin_max = STRIMP_HEADER_SIZE - 1; + size_t hidx; + + TRC_DEBUG_IF(ALGO) t0 = GDKusec(); + + for(i = 0; i < STRIMP_HEADER_SIZE; i++) + h->bytepairs[i] = 0; + + for(i = 0; i < hist_size; i++) { + if (max_counts[cmin_max] < hist[i]) { + max_counts[cmin_max] = hist[i]; + h->bytepairs[cmin_max] = i; +for(hidx = cmin_max; hidx > 0 && max_counts[hidx] > max_counts[hidx-1]; hidx--) { + swp(max_counts, hidx, hidx-1, uint64_t); + swp(h->bytepairs, hidx, hidx-1, DataPair); + } + } + } + + for(i = 0; i < STRIMP_HEADER_SIZE; i++) { + TRC_DEBUG(ALGO, "%u %u: %lu", indexToPair1(h->bytepairs[i]), indexToPair2(h->bytepairs[i]), max_counts[i]); + } + + TRC_DEBUG(ALGO, LLFMT " usec\n", GDKusec() - t0); + + return h; +} { uint64_t hist[STRIMP_HISTSIZE] = {0}; size_t nbins = 0; ___ checkin-list mailing list checkin-list@monetdb.org 
https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Some utility functions
Changeset: a13846692aaa for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=a13846692aaa Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Some utility functions - lookup the index of a pair in the header - construct a bitstring for a given string encoding the presence or absence of the pairs in the header These should probably be inlined. diffs (68 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -227,33 +227,43 @@ create_header(BAT *b) } -/* static uint8_t */ -/* lookup_index(StrimpHeader *h, uint16_t n) */ -/* { */ -/* size_t i; */ -/* for(i = 0; i < STRIMP_SIZE; i++) */ -/* if(h->bytepairs[i] == n) */ -/* return i; */ +/* Given a strimp h and a DataPair p, return the index i for which + * + * h[i] == p + * + * Returns 0 if p is not in h. + * + * TODO: Should this be inlined somehow? (probably yes) + */ +static uint8_t +lookup_index(StrimpHeader *h, DataPair n) +{ + size_t i; + for(i = 0; i < STRIMP_HEADER_SIZE; i++) + if(h->bytepairs[i] == n) + return i; -/* return 0; */ -/* } */ + return 0; +} /* Given a strimp header and a string compute the bitstring of which * digrams(byte pairs) are present in the string. The strimp header is a * map from digram(byte pair) to index in the strimp. + * + * This should probably be inlined. 
*/ -/* static uint64_t */ -/* GDKstrimp_make_bitstring(str s, StrimpHeader *h) */ -/* { */ -/* uint64_t ret = 0; */ -/* uint8_t pair_idx; */ -/* char *it; */ +static uint64_t +GDKstrimp_make_bitstring(const str s, StrimpHeader *h) +{ + uint64_t ret = 0; + uint8_t pair_idx; + char *it; -/* for(it = s; *it != 0 && *(it+1) != 0; it++) { */ -/* pair_idx = lookup_index(h, pairToIndex(*it, *(it+1))); */ -/* ret |= 0x1 << pair_idx; */ -/* } */ + for(it = s; *it != 0 && *(it+1) != 0; it++) { + pair_idx = lookup_index(h, pairToIndex(*it, *(it+1))); + ret |= 0x1 << pair_idx; + } -/* return ret; */ -/* } */ + return ret; +} ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Expose strimp construction to MAL
Changeset: 532b3fb7b9ff for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=532b3fb7b9ff Modified Files: monetdb5/modules/mal/batExtensions.c Branch: string_imprints Log Message: Expose strimp construction to MAL diffs (33 lines): diff --git a/monetdb5/modules/mal/batExtensions.c b/monetdb5/modules/mal/batExtensions.c --- a/monetdb5/modules/mal/batExtensions.c +++ b/monetdb5/modules/mal/batExtensions.c @@ -402,7 +402,7 @@ PATstrimp_makehist(Client cntxt, MalBlkP } static str -PATstrimp_makeheader(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) +PATstrimp(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) { bat bid; BAT *b; @@ -413,9 +413,10 @@ PATstrimp_makeheader(Client cntxt, MalBl if ((b = BATdescriptor(bid)) == NULL) throw(MAL, "bat.strimpHeader", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); - if(GDKstrimp_make_header(b) != GDK_SUCCEED) + if(GDKstrimp_create_strimp(b) != GDK_SUCCEED) throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) OPERATION_FAILED); + // *getArgReference_lng(stk, pci, 0) = 0; return MAL_SUCCEED; } @@ -452,7 +453,7 @@ mel_func batExtensions_init_funcs[] = { /* String imprints */ pattern("bat", "strimpNDigrams", PATstrimp_ndigrams, false, "count digrams in a string bat", args(1,2,arg("",lng),batarg("b",str))), pattern("bat", "strimpHistogram", PATstrimp_makehist, false, "make a histogram of all the byte pairs in a BAT", args(2,3,arg("",lng), batarg("",lng),batarg("b",str))), - pattern("bat", "strimpHeader", PATstrimp_makeheader, false, "construct the strimp header from a BAT", args(1,2,arg("",void),batarg("b",str))), + pattern("bat", "strimp", PATstrimp, false, "construct the strimp a BAT", args(1,2,arg("",void),batarg("b",str))), { .imp=NULL } }; #include "mal_import.h" ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Add a wrapper that allocates space for the header
Changeset: e09bb9a38502 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e09bb9a38502 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Add a wrapper that allocates space for the header diffs (32 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -206,17 +206,24 @@ make_header(StrimpHeader *h, uint64_t* h return h; } + +static StrimpHeader * +create_header(BAT *b) { uint64_t hist[STRIMP_HISTSIZE] = {0}; size_t nbins = 0; - StrimpHeader header; + StrimpHeader *header; + if ((header = (StrimpHeader*)GDKmalloc(sizeof(StrimpHeader))) == NULL) + return NULL; + if(GDKstrimp_make_histogram(b, hist, STRIMP_HISTSIZE, ) != GDK_SUCCEED) { - return GDK_FAIL; + GDKfree(header); + return NULL; } - make_header(, hist, STRIMP_HISTSIZE); + make_header(header, hist, STRIMP_HISTSIZE); - return GDK_SUCCEED; + return header; } ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Add some documentation
Changeset: d0711db453cd for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d0711db453cd Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Add some documentation diffs (105 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -6,6 +6,40 @@ * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V. */ + +/* A string imprint is an index that can be used as a prefilter in LIKE + * queries. It has 2 components: + * + * - a header of 64 string element pairs (bytes in the current + * implementation but maybe unicode chars might make more sense). + * + * - a 64 bit mask for each item in the BAT that encodes the presence or + * absence of each element of the header in the specific item. + * + * A string imprint is stored in a new Heap in the BAT. + * + * In the current (byte pair) implementation the first 136 bytes + * (i.e. the first 17 64 bit quantities) in the Heap are as follows: + * + * | Version Number | - + * | byte pair 01 | byte pair 02 | byte pair 03 | byte pair 04 | | + * | byte pair 05 | byte pair 06 | byte pair 07 | byte pair 08 | | 17 64 bit quantities + * [...] | + * | byte pair 61 | byte pair 62 | byte pair 63 | byte pair 64 | - + * + * The bitmasks for each string in the BAT follow after this. + * + * Strimp creation goes as follows: + * + * - Construct a histogram of the element (byte or character) pairs for + * all the strings in the BAT. + * + * - Take the 64 most frequent pairs as the Strimp Header. + * + * - For each string in the bat construct a 64 bit mask that encodes the + * presence or absence of each member of the header in the string. + */ + #include "monetdb_config.h" #include "gdk.h" #include "gdk_private.h" @@ -13,33 +47,35 @@ /* This counts how many unicode codepoints the given string * contains. 
*/ -/* static size_t */ -/* GDKstrimp_strlen(const uint8_t *s) */ -/* { */ -/* size_t ret = 0; */ -/* size_t i; */ -/* int m,n; */ -/* uint8_t c; */ +#if 0 +static size_t +GDKstrimp_strlen(const uint8_t *s) +{ + size_t ret = 0; + size_t i; + int m,n; + uint8_t c; -/* i = 0; */ -/* while((c = *(s + i)) != 0) { */ -/* if (c < 0x80) */ -/* i++; */ -/* else { */ -/* for (n = 0, m=0x40; c & m; n++, m >>= 1) */ -/* ; */ -/* /\* n is now the number of 10xx bytes that should */ -/*follow. *\/ */ -/* if (n == 0 || n >= 4) */ -/* /\* TODO: handle invalid utf-8 *\/ */ -/* {} */ -/* i += n+1; */ -/* } */ -/* ret++; */ -/* } */ + i = 0; + while((c = *(s + i)) != 0) { + if (c < 0x80) + i++; + else { + for (n = 0, m=0x40; c & m; n++, m >>= 1) + ; + /* n is now the number of 10xx bytes that should + follow. */ + if (n == 0 || n >= 4) + /* TODO: handle invalid utf-8 */ + {} + i += n+1; + } + ret++; + } -/* return ret; */ -/* } */ + return ret; +} +#endif /* Given a BAT return the number of digrams in it. The observation is * that the number of digrams is the number of characters - 1: ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Count byte pairs instead of unicode character pairs
Changeset: fbcd6ce89476 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=fbcd6ce89476 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Count byte pairs instead of unicode character pairs diffs (83 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -13,33 +13,33 @@ /* This counts how many unicode codepoints the given string * contains. */ -static size_t -GDKstrimp_strlen(const uint8_t *s) -{ - size_t ret = 0; - size_t i; - int m,n; - uint8_t c; +/* static size_t */ +/* GDKstrimp_strlen(const uint8_t *s) */ +/* { */ +/* size_t ret = 0; */ +/* size_t i; */ +/* int m,n; */ +/* uint8_t c; */ - i = 0; - while((c = *(s + i)) != 0) { - if (c < 0x80) - i++; - else { - for (n = 0, m=0x40; c & m; n++, m >>= 1) - ; - /* n is now the number of 10xx bytes that should - follow. */ - if (n == 0 || n >= 4) - /* TODO: handle invalid utf-8 */ - {} - i += n+1; - } - ret++; - } +/* i = 0; */ +/* while((c = *(s + i)) != 0) { */ +/* if (c < 0x80) */ +/* i++; */ +/* else { */ +/* for (n = 0, m=0x40; c & m; n++, m >>= 1) */ +/* ; */ +/* /\* n is now the number of 10xx bytes that should */ +/*follow. *\/ */ +/* if (n == 0 || n >= 4) */ +/* /\* TODO: handle invalid utf-8 *\/ */ +/* {} */ +/* i += n+1; */ +/* } */ +/* ret++; */ +/* } */ - return ret; -} +/* return ret; */ +/* } */ /* Given a BAT return the number of digrams in it. 
The observation is * that the number of digrams is the number of characters - 1: @@ -55,7 +55,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) // lng t0; BUN i; BATiter bi; - uint8_t *s; + char *s; // GDKtracer_set_component_level("ALGO", "DEBUG"); // struct canditer ci; @@ -66,8 +66,9 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) bi = bat_iterator(b); *n = 0; for (i = 0; i < b->batCount; i++) { - s = (uint8_t *)BUNtail(bi, i); -*n += GDKstrimp_strlen(s) - 1; + s = (char *)BUNtail(bi, i); +// *n += GDKstrimp_strlen(s) - 1; + *n += strlen(s) - 1; // TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, s); } ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Get the correct argument from the MAL stack
Changeset: 0d8e5444d101 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=0d8e5444d101 Modified Files: monetdb5/modules/mal/batExtensions.c Branch: string_imprints Log Message: Get the correct argument from the MAL stack diffs (12 lines): diff --git a/monetdb5/modules/mal/batExtensions.c b/monetdb5/modules/mal/batExtensions.c --- a/monetdb5/modules/mal/batExtensions.c +++ b/monetdb5/modules/mal/batExtensions.c @@ -409,7 +409,7 @@ PATstrimp_makeheader(Client cntxt, MalBl (void)cntxt; (void)mb; - bid = *getArgReference_bat(stk, pci, 2); + bid = *getArgReference_bat(stk, pci, 1); if ((b = BATdescriptor(bid)) == NULL) throw(MAL, "bat.strimpHeader", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Small changes
Changeset: 1ef057324896 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=1ef057324896 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Small changes diffs (21 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -84,7 +84,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) */ #define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x))) #define isNotIgnored(x) (!isIgnored(x)) -#define pairToIndex(b1, b2) (((uint8_t)b1)<<8 | ((uint8_t)b2)) +#define pairToIndex(b1, b2) (DataPair)(((uint8_t)b1)<<8 | ((uint8_t)b2)) #define indexToPair1(idx) (idx & 0xff00) >> 8 #define indexToPair2(idx) (idx & 0xff) #define swp(_a, _i, _j, TPE) \ @@ -148,7 +148,7 @@ GDKstrimp_make_histogram(BAT *b, uint64_ } } - TRC_DEBUG_ENDIF(ALGO, LLFMT "usec\n", GDKusec() - t0); + TRC_DEBUG(ALGO, LLFMT " usec\n", GDKusec() - t0); GDKtracer_flush_buffer(); return GDK_SUCCEED; } ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Functions to construct the string imprint for a given BAT
Changeset: 6ab7ac7f1321 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6ab7ac7f1321 Modified Files: gdk/gdk.h gdk/gdk_private.h gdk/gdk_strimps.c gdk/gdk_strimps.h Branch: string_imprints Log Message: Functions to construct the string imprint for a given BAT diffs (153 lines): diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -701,6 +701,7 @@ typedef struct { Hash *hash; /* hash table */ Imprints *imprints; /* column imprints index */ Heap *orderidx; /* order oid index */ + Heap *strimps; /* string imprint index */ PROPrec *props; /* list of dynamic properties stored in the bat descriptor */ } COLrec; @@ -772,6 +773,7 @@ typedef struct BATiter { #define thash T.hash #define timprints T.imprints #define tprops T.props +#define tstrimps T.strimps diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h --- a/gdk/gdk_private.h +++ b/gdk/gdk_private.h @@ -25,7 +25,8 @@ enum heaptype { varheap, hashheap, imprintsheap, - orderidxheap + orderidxheap, + strimpheap }; #ifdef GDKLIBRARY_OLDDATE diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -267,3 +267,85 @@ GDKstrimp_make_bitstring(const str s, St return ret; } + +/* Create the heap for a string imprint. Returns NULL on failure. */ +static Heap * +createStrimpheap(BAT *b, StrimpHeader *h) +{ + Heap *r = NULL; + uint64_t *d; + size_t i,j; + const char *nme; + + nme = GDKinmemory(b->theap.farmid) ? ":memory:" : BBP_physical(b->batCacheid); + if ((r = GDKzalloc(sizeof(Heap))) == NULL || + (r->farmid = BBPselectfarm(b->batRole, b->ttype, strimpheap)) < 0 || + strconcat_len(r->filename, sizeof(r->filename), + nme, ".strimp", NULL) >= sizeof(r->filename) || + HEAPalloc(r, BATcount(b) + STRIMP_OFFSET, sizeof(uint64_t)) != GDK_SUCCEED) { + GDKfree(r); + return NULL; + } + r->free = STRIMP_OFFSET * sizeof(uint64_t); + + d = (uint64_t *)r->base; + /* This loop assumes that we are working with byte pairs +* (i.e. 
the type of the header is uint16_t). TODO: generalize. +*/ + for(i = 0; i < STRIMP_HEADER_SIZE; i += 4) { + *d = 0; + for(j = 0; j < 4; j++) { + *d <<= 16; + *d |= h->bytepairs[i + j]; + } + } + return r; +} + +/* Create */ +gdk_return +GDKstrimp_create_strimp(BAT *b) +{ + lng t0 = 0; + BATiter bi; + BUN i; + str s; + StrimpHeader *head; + Heap *h; + uint64_t *dh; + + assert(b->ttype == TYPE_str); + TRC_DEBUG_IF(ALGO) t0 = GDKusec(); + + if ((head = create_header(b)) == NULL) { + return GDK_FAIL; + } + + if ((h = createStrimpheap(b, head)) == NULL) { + GDKfree(head); + return GDK_FAIL; + } + dh = (uint64_t *)h->base + h->free; + + bi = bat_iterator(b); + for (i = 0; i < b->batCount; i++) { + s = (str)BUNtvar(bi, i); + if (!strNil(s)) + *dh++ = GDKstrimp_make_bitstring(s, head); + else + *dh++ = 0; /* no pairs in nil values */ + + } + + /* After we have computed the strimp, attempt to write it back +* to the BAT. +*/ + MT_lock_set(>batIdxLock); + b->tstrimps = h; + b->batDirtydesc = true; + /* persistStrimp(b) */ + MT_lock_unset(>batIdxLock); + + TRC_DEBUG(ALGO, "strimp creation took " LLFMT " usec\n", GDKusec()-t0); + return GDK_SUCCEED; +} diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h --- a/gdk/gdk_strimps.h +++ b/gdk/gdk_strimps.h @@ -11,19 +11,25 @@ #include + +#define STRIMP_VERSION (uint64_t)1 /* Count the occurences of pairs of bytes. This is a compromise between * just handling ASCII and full UTF-8 support. */ #define STRIMP_HISTSIZE 256*256 -#define STRIMP_SIZE 64 +#define STRIMP_HEADER_SIZE 64 +#define STRIMP_OFFSET 1 + STRIMP_HEADER_SIZE*sizeof(DataPair)/sizeof(uint64_t) /* version + header */ + +typedef uint16_t DataPair; typedef struct { // TODO: find a better name for this - uint16_t bytepairs[STRIMP_SIZE]; + DataPair bytepairs[STRIMP_HEADER_SIZE]; } StrimpHeader; gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n); // Remove? 
gdk_export gdk_return GDKstrimp_make_histogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins); // make static // gdk_export gdk_return GDKstrimp_make_header(StrimpHeader *h, uint64_t *hist, size_t hist_size); // make static -gdk_export gdk_return GDKstrimp_make_header(BAT *b); +//gdk_export gdk_return
MonetDB: string_imprints - Handle ignored bytes correctly
Changeset: e752aa525361 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e752aa525361 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Handle ignored bytes correctly diffs (35 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -93,7 +93,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) gdk_return GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *count) { - lng t0; + lng t0=0; size_t hi; BUN i; BATiter bi; @@ -111,7 +111,21 @@ GDKstrimp_makehistogram(BAT *b, uint64_t s = (char *)BUNtvar(bi, i); if (!strNil(s)) { for(ptr = s; *ptr != 0 && *(ptr + 1) != 0; ptr++) { - if (isNotIgnored(*ptr) && isNotIgnored(*(ptr+1))) { + if (isIgnored(*(ptr+1))) { + /* Skip this and the next pair +* if the next char is ignored. +*/ + ptr++; + } + else if (isIgnored(*ptr)) { + /* Skip this pair if the current +* char is ignored. This should +* only happen at the beginnig +* of a string. +*/ + ; + } + else { hi = pairToIndex(*(ptr), *(ptr+1)); assert(hi < hist_size); if (hist[hi] == 0) ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - First implementation of strimp header construction
Changeset: 57ba6f8b90aa for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=57ba6f8b90aa Modified Files: gdk/gdk_strimps.c gdk/gdk_strimps.h monetdb5/modules/mal/batExtensions.c Branch: string_imprints Log Message: First implementation of strimp header contruction diffs (212 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -84,7 +84,56 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) #define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x))) #define isNotIgnored(x) (!isIgnored(x)) #define pairToIndex(b1, b2) (((uint8_t)b1)<<8 | ((uint8_t)b2)) +#define indexToPair1(idx) (idx & 0xff00) >> 8 +#define indexToPair2(idx) (idx & 0xff) +#define swp(_a, _i, _j, TPE) \ + do {\ + TPE _t = ((TPE *)_a)[_i]; \ + ((TPE *) _a)[_i] = ((TPE *) _a)[_j];\ + ((TPE *) _a)[_j] = _t; \ + } while(0) +static StrimpHeader * +make_header(StrimpHeader *h, uint64_t* hist, size_t hist_size) +{ + lng t0 = 0; + size_t i; + uint64_t max_counts[STRIMP_SIZE] = {0}; + const size_t cmin_max = STRIMP_SIZE - 1; + size_t hidx; + + TRC_DEBUG_IF(ALGO) t0 = GDKusec(); + + for(i = 0; i < STRIMP_SIZE; i++) + h->bytepairs[i] = 0; + + for(i = 0; i < hist_size; i++) { + if (max_counts[cmin_max] < hist[i]) { + max_counts[cmin_max] = hist[i]; + h->bytepairs[cmin_max] = i; +for(hidx = cmin_max; hidx > 0 && max_counts[hidx] > max_counts[hidx-1]; hidx--) { + swp(max_counts, hidx, hidx-1, uint64_t); + swp(h->bytepairs, hidx, hidx-1, uint16_t); + } + } + } + + for(i = 0; i < STRIMP_SIZE; i++) { + TRC_DEBUG(ALGO, "%u %u: %lu", indexToPair1(h->bytepairs[i]), indexToPair2(h->bytepairs[i]), max_counts[i]); + } + + TRC_DEBUG_ENDIF(ALGO, LLFMT "usec\n", GDKusec() - t0); + + return h; +} + + +/* static uint64_t */ +/* add_to_header(size_t idx, uint64_t count) */ +/* { */ +/* while */ +/* return GDK_SUCCEED; */ +/* } */ /* Construct a histogram of pairs of bytes. 
* * Return the histogram in hist and the number of non-zero bins in @@ -98,6 +147,7 @@ GDKstrimp_make_histogram(BAT *b, uint64_ BUN i; BATiter bi; char *ptr, *s; + /* uint64_t cur_min = 0; */ TRC_DEBUG_IF(ALGO) t0 = GDKusec(); assert(b->ttype == TYPE_str); @@ -131,12 +181,61 @@ GDKstrimp_make_histogram(BAT *b, uint64_ if (hist[hi] == 0) (*nbins)++; hist[hi]++; + /* if (hist[hi] > cur_min) */ + /* cur_min = add_to_header(hi, hist[hi]); */ } } } } - TRC_DEBUG(ALGO, LLFMT "usec\n", GDKusec() - t0); + TRC_DEBUG_ENDIF(ALGO, LLFMT "usec\n", GDKusec() - t0); GDKtracer_flush_buffer(); return GDK_SUCCEED; } + +gdk_return +GDKstrimp_make_header(BAT *b) +{ + uint64_t hist[STRIMP_HISTSIZE] = {0}; + size_t nbins = 0; + StrimpHeader header; + if(GDKstrimp_make_histogram(b, hist, STRIMP_HISTSIZE, ) != GDK_SUCCEED) { + return GDK_FAIL; + } + + make_header(, hist, STRIMP_HISTSIZE); + + return GDK_SUCCEED; +} + + +/* static uint8_t */ +/* lookup_index(StrimpHeader *h, uint16_t n) */ +/* { */ +/* size_t i; */ +/* for(i = 0; i < STRIMP_SIZE; i++) */ +/* if(h->bytepairs[i] == n) */ +/* return i; */ + +/* return 0; */ +/* } */ + + +/* Given a strimp header and a string compute the bitstring of which + * digrams(byte pairs) are present in the string. The strimp header is a + * map from digram(byte pair) to index in the strimp. + */ +/* static uint64_t */ +/* GDKstrimp_make_bitstring(str s, StrimpHeader *h) */ +/* { */ +/* uint64_t ret = 0; */ +/* uint8_t pair_idx; */ +/* char *it; */ + +/* for(it = s; *it != 0 && *(it+1) != 0; it++) { */ +/* pair_idx = lookup_index(h, pairToIndex(*it, *(it+1))); */ +/* ret |= 0x1 << pair_idx; */ +/* } */ + +/* return ret; */ +/* } */ diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h --- a/gdk/gdk_strimps.h +++ b/gdk/gdk_strimps.h @@ -15,8 +15,15 @@ * just handling ASCII and full UTF-8 support. 
*/ #define STRIMP_HISTSIZE 256*256 - -gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n); +#define STRIMP_SIZE 64 -gdk_export gdk_return GDKstrimp_make_histogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins); +typedef struct { +
MonetDB: string_imprints - Basic correct implementation
Changeset: 2e4b7358231f for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=2e4b7358231f Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Basic correct implementation Do not miss anything, do not allow ignored characters. diffs (64 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -81,16 +81,25 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) * non-ASCII codepoints that are considered spaces, for example the * codepoints in the range U+2000-U+200f. */ -#define isIgnored(x) isspace((x)) || isdigit((x)) -#define pairToIndex(b1, b2) (b1)<<8 | (b2) +#define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x))) +#define isNotIgnored(x) (!isIgnored(x)) +#define pairToIndex(b1, b2) (((uint8_t)b1)<<8 | ((uint8_t)b2)) +/* Construct a histogram of pairs of bytes. + * + * Return the histogram in hist and the number of non-zero bins in + * count. + */ gdk_return GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *count) { + lng t0; size_t hi; BUN i; BATiter bi; - uint8_t *ptr, *s; + char *ptr, *s; + + TRC_DEBUG_IF(ALGO) t0 = GDKusec(); assert(b->ttype == TYPE_str); for(hi = 0; hi < hist_size; hi++) @@ -99,18 +108,21 @@ GDKstrimp_makehistogram(BAT *b, uint64_t bi = bat_iterator(b); *count = 0; for(i = 0; i < b->batCount; i++) { - s = (uint8_t *)BUNtail(bi, i); - for(ptr = s; *(ptr + 1) != 0; ptr++) { - if (isIgnored(*ptr)) /* skip the current pair and the next at the end of the loop */ - ptr++; - else { - hi = pairToIndex(*(ptr), *(ptr+1)); - assert(hi < hist_size); - if (hist[hi] == 0) - (*count)++; - hist[hi]++; + s = (char *)BUNtvar(bi, i); + if (!strNil(s)) { + for(ptr = s; *ptr != 0 && *(ptr + 1) != 0; ptr++) { + if (isNotIgnored(*ptr) && isNotIgnored(*(ptr+1))) { + hi = pairToIndex(*(ptr), *(ptr+1)); + assert(hi < hist_size); + if (hist[hi] == 0) + (*count)++; + hist[hi]++; + } } } } + + TRC_DEBUG(ALGO, LLFMT "usec\n", GDKusec() - t0); + 
GDKtracer_flush_buffer(); return GDK_SUCCEED; } ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Improve names
Changeset: 950d5acff27f for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=950d5acff27f Modified Files: gdk/gdk_strimps.c gdk/gdk_strimps.h monetdb5/modules/mal/batExtensions.c Branch: string_imprints Log Message: Improve names diffs (53 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -91,7 +91,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) * count. */ gdk_return -GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *count) +GDKstrimp_make_histogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins) { lng t0=0; size_t hi; @@ -106,7 +106,7 @@ GDKstrimp_makehistogram(BAT *b, uint64_t hist[hi] = 0; bi = bat_iterator(b); - *count = 0; + *nbins = 0; for(i = 0; i < b->batCount; i++) { s = (char *)BUNtvar(bi, i); if (!strNil(s)) { @@ -129,7 +129,7 @@ GDKstrimp_makehistogram(BAT *b, uint64_t hi = pairToIndex(*(ptr), *(ptr+1)); assert(hi < hist_size); if (hist[hi] == 0) - (*count)++; + (*nbins)++; hist[hi]++; } } diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h --- a/gdk/gdk_strimps.h +++ b/gdk/gdk_strimps.h @@ -17,6 +17,6 @@ #define STRIMP_HISTSIZE 256*256 gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n); -gdk_export gdk_return GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *count); +gdk_export gdk_return GDKstrimp_make_histogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins); #endif /* _GDK_STRIMPS_H_ */ diff --git a/monetdb5/modules/mal/batExtensions.c b/monetdb5/modules/mal/batExtensions.c --- a/monetdb5/modules/mal/batExtensions.c +++ b/monetdb5/modules/mal/batExtensions.c @@ -380,7 +380,7 @@ CMDstrimp_makehist(Client cntxt, MalBlkP if ((b = BATdescriptor(bid)) == NULL) throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); - if (!GDKstrimp_makehistogram(b, hist, STRIMP_HISTSIZE, )) { + if (!GDKstrimp_make_histogram(b, hist, STRIMP_HISTSIZE, )) { throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) 
OPERATION_FAILED); } ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Count the total number of digrams in a string bat
Changeset: fabfd34343c3 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=fabfd34343c3 Added Files: gdk/gdk_strimps.c gdk/gdk_strimps.h Modified Files: gdk/CMakeLists.txt gdk/gdk.h monetdb5/modules/mal/01_calc.mal monetdb5/modules/mal/batcalc.c Branch: string_imprints Log Message: Count the total number of digrams in a string bat diffs (190 lines): diff --git a/gdk/CMakeLists.txt b/gdk/CMakeLists.txt --- a/gdk/CMakeLists.txt +++ b/gdk/CMakeLists.txt @@ -78,6 +78,7 @@ target_sources(bat gdk_analytic_func.c gdk_analytic.h gdk_tracer.c gdk_tracer.h + gdk_strimps.c gdk_strimps.h PUBLIC ${gdk_public_headers}) diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -2113,4 +2113,9 @@ gdk_export BAT *BATsample_with_seed(BAT */ #define MAXPARAMS 32 +/* + * String Imprints Development/Testing. TODO: remove the following. + */ + +#include "gdk_strimps.h" #endif /* _GDK_H_ */ diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c new file mode 100644 --- /dev/null +++ b/gdk/gdk_strimps.c @@ -0,0 +1,55 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V. + */ + +#include "monetdb_config.h" +#include "gdk.h" +#include "gdk_private.h" + +/* This counts how many unicode codepoints the given string + * contains. + */ +static size_t +GDKstrimp_strlen(const char *s) +{ + return strlen(s); +} + +/* Given a BAT return the number of digrams in it. The observation is + * that the number of digrams is the number of characters - 1: + * + * 1 digram starting at character 1 + * 1 digram starting at character 2 + * [...] 
+ * 1 digram starting at character n - 1 + */ +gdk_return +GDKstrimp_ndigrams(BAT *b, size_t *n) +{ + // lng t0; + BUN i; + BATiter bi; + char *s; + // GDKtracer_set_component_level("ALGO", "DEBUG"); + // struct canditer ci; + + // t0 = GDKusec(); + // BATcheck(b, NULL); + assert(b->ttype == TYPE_str); + + bi = bat_iterator(b); + *n = 0; + for (i = 0; i < b->batCount; i++) { + s = (char *)BUNtail(bi, i); +*n += GDKstrimp_strlen(s) - 1; + // TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, (char *)BUNtail(bi, i)); + } + + // TRC_DEBUG(ALGO, LLFMT "usec\n", GDKusec() - t0); + + return GDK_SUCCEED; +} diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h new file mode 100644 --- /dev/null +++ b/gdk/gdk_strimps.h @@ -0,0 +1,27 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V. 
+ */ + +#ifndef _GDK_STRIMPS_H_ +#define _GDK_STRIMPS_H_ + +#include + +#define HISTSIZE 64 + +typedef struct { + uint64_t counts[HISTSIZE]; + char foo; +} Histogram; + +typedef struct { + Histogram* hist; +} Strimp; + +gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n); + +#endif /* _GDK_STRIMPS_H_ */ diff --git a/monetdb5/modules/mal/01_calc.mal b/monetdb5/modules/mal/01_calc.mal --- a/monetdb5/modules/mal/01_calc.mal +++ b/monetdb5/modules/mal/01_calc.mal @@ -5593,3 +5593,9 @@ comment "Calculate aggregate string conc pattern str_group_concat(b:bat[:str],sep:bat[:str],s:bat[:oid],nil_if_empty:bit) :str address CMDBATstr_group_concat comment "Calculate aggregate string concatenate of B with candidate list and separator SEP."; + + +# foo +pattern str_iterate_bat(b:bat[:str]) :void; +address CMDstr_iterate_bat +comment "iterate through a bat"; diff --git a/monetdb5/modules/mal/batcalc.c b/monetdb5/modules/mal/batcalc.c --- a/monetdb5/modules/mal/batcalc.c +++ b/monetdb5/modules/mal/batcalc.c @@ -1368,6 +1368,39 @@ CMDifthen(Client cntxt, MalBlkPtr mb, Ma return MAL_SUCCEED; } + +/* + * String imprints dev/testing. TODO: remove. + */ +static str +CMDstrimp_ndigrams(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) +{ + bat bid; + BAT *b; + size_t n; + + (void)cntxt; + (void)mb; + + // return mythrow(MAL, "batcalc.striter", OPERATION_FAILED); + bid = *getArgReference_bat(stk, pci, 1); + if ((b = BATdescriptor(bid)) == NULL) + throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); + + if (!GDKstrimp_ndigrams(b, )) { + throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) OPERATION_FAILED); + } + + *getArgReference_lng(stk, pci, 0) = n; + + return MAL_SUCCEED; +} + + +/* + * String imprints dev/testing. TODO: end remove. + */ + #include "mel.h" static str @@ -2187,7 +2220,17 @@ static mel_func
MonetDB: string_imprints - Merge branch 'master' into branches/string_imprints
Changeset: c5599a533197 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c5599a533197 Branch: string_imprints Log Message: Merge branch 'master' into branches/string_imprints diffs (truncated from 13685 to 300 lines): diff --git a/ctest/tools/monetdbe/CMakeLists.txt b/ctest/tools/monetdbe/CMakeLists.txt --- a/ctest/tools/monetdbe/CMakeLists.txt +++ b/ctest/tools/monetdbe/CMakeLists.txt @@ -20,12 +20,6 @@ target_link_libraries(example2 monetdbe) add_test(run_example2 example2) -add_executable(example_proxy example_proxy.c) -target_link_libraries(example_proxy - PRIVATE -monetdb_config_header -monetdbe) - add_executable(example_temporal example_temporal.c) target_link_libraries(example_temporal PRIVATE @@ -77,12 +71,12 @@ target_link_libraries(example_connection monetdbe) add_test(run_example_connections example_connections) -add_executable(example_remote example_remote.c) -target_link_libraries(example_remote +add_executable(example_proxy example_proxy.c) +target_link_libraries(example_proxy PRIVATE monetdb_config_header monetdbe) -add_test(run_example_remote example_remote) +add_test(run_example_proxy example_proxy) if(WITH_CMOCKA) add_executable(cmocka_test cmocka_test.c test_helper.c) @@ -95,3 +89,9 @@ if(WITH_CMOCKA) ) add_test(run_cmocka_test cmocka_test) endif() + +if (TESTING) + install(TARGETS +example_proxy +DESTINATION ${CMAKE_INSTALL_BINDIR}) +endif() diff --git a/ctest/tools/monetdbe/Tests/All b/ctest/tools/monetdbe/Tests/All new file mode 100644 --- /dev/null +++ b/ctest/tools/monetdbe/Tests/All @@ -0,0 +1,1 @@ +example_proxy diff --git a/ctest/tools/monetdbe/Tests/example_proxy.SQL.py b/ctest/tools/monetdbe/Tests/example_proxy.SQL.py new file mode 100644 --- /dev/null +++ b/ctest/tools/monetdbe/Tests/example_proxy.SQL.py @@ -0,0 +1,45 @@ +import os, pymonetdb +import subprocess + +db = os.getenv("TSTDB") +port = os.getenv("MAPIPORT") + +client1 = pymonetdb.connect(database=db, port=port, autocommit=True) +cur1 = client1.cursor() 
+cur1.execute(''' +CREATE TABLE test (x INTEGER, y STRING); +INSERT INTO test VALUES (42, 'Hello'), (NULL, 'World'); +''') + +cur1.close() +client1.close() + +cmd = ['example_proxy', port, db] +results = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding='utf-8') + +if results.stderr: +print(results.stderr) + +lines = results.stdout.splitlines() + +if len(lines) != 3: +print(results.stdout) +print("Too many output lines.") +exit(1) + +def test_equal(expected, received): +if received != expected: +print("expected:") +print(expected) +print("received:") +print(received) +exit(1) + +expected="Query result with 2 cols and 2 rows" +test_equal(expected, lines[0]) + +expected="42, Hello" +test_equal(expected, lines[1]) + +expected="NULL, World" +test_equal(expected, lines[2]) diff --git a/ctest/tools/monetdbe/example_proxy.c b/ctest/tools/monetdbe/example_proxy.c --- a/ctest/tools/monetdbe/example_proxy.c +++ b/ctest/tools/monetdbe/example_proxy.c @@ -11,63 +11,34 @@ #include #include +#define expected_error(msg) {fprintf(stderr, "Failure: %s\n", msg); return 0;} #define error(msg) {fprintf(stderr, "Failure: %s\n", msg); return -1;} int -main(void) +main(int argc, char** argv) { + (void) argc; char* err = NULL; monetdbe_database mdbe = NULL; monetdbe_result* result = NULL; - -monetdbe_remote remote = { -.host = "127.0.0.1", -.port = 50001, -.username="monetdb", -.password="monetdb", -.lang="sql"}; - -monetdbe_options opt = {.remote = }; + assert(argc==3); + const int port = strtol(argv[1], NULL, 10); + const char* database = argv[2]; + monetdbe_remote remote = {.host="localhost", .port=port, .database=database, .username="monetdb", .password="monetdb"}; + monetdbe_options opts = {.remote = }; - // second argument is a string for the db directory or NULL for in-memory mode - if (monetdbe_open(, "mapi:monetdb://127.0.0.1:5?database=devdb", )) - error("Failed to open database") - - - if ((err = monetdbe_query(mdbe, "DELETE FROM test WHERE x < 0; ", 
, NULL)) != NULL) + if (monetdbe_open(, NULL, )) + expected_error("Failed to open database") + if ((err = monetdbe_query(mdbe, "SELECT x, y FROM test ORDER BY y ASC; ", , NULL)) != NULL) error(err) - if ((err = monetdbe_query(mdbe, "SELECT * FROM test; ", , NULL)) != NULL) - error(err) - - monetdbe_column* appendable_columns[2]; - fprintf(stdout, "Query result with %zu cols and %"PRId64" rows\n", result->ncols, result->nrows); for (int64_t r = 0; r < result->nrows; r++) { for (size_t c = 0; c < result->ncols; c++) { monetdbe_column* rcol; if ((err
MonetDB: string_imprints - Move MAL code to proper module
Changeset: f6360c814cda for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f6360c814cda Modified Files: monetdb5/modules/mal/batExtensions.c monetdb5/modules/mal/batcalc.c Branch: string_imprints Log Message: Move MAL code to proper module diffs (155 lines): diff --git a/monetdb5/modules/mal/batExtensions.c b/monetdb5/modules/mal/batExtensions.c --- a/monetdb5/modules/mal/batExtensions.c +++ b/monetdb5/modules/mal/batExtensions.c @@ -337,6 +337,75 @@ CMDBATappend_bulk(Client cntxt, MalBlkPt return MAL_SUCCEED; } +/* + * String imprints dev/testing. TODO: remove. + */ +static str +CMDstrimp_ndigrams(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) +{ + bat bid; + BAT *b; + size_t n; + + (void)cntxt; + (void)mb; + + // return mythrow(MAL, "batcalc.striter", OPERATION_FAILED); + bid = *getArgReference_bat(stk, pci, 1); + if ((b = BATdescriptor(bid)) == NULL) + throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); + + if (!GDKstrimp_ndigrams(b, )) { + throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) OPERATION_FAILED); + } + + *getArgReference_lng(stk, pci, 0) = n; + + return MAL_SUCCEED; +} + +static str +CMDstrimp_makehist(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) +{ + bat bid; + BAT *b, *ob; + size_t i; + uint64_t hist[STRIMP_HISTSIZE]; + uint16_t count; + + (void)cntxt; + (void)mb; + + bid = *getArgReference_bat(stk, pci, 2); + if ((b = BATdescriptor(bid)) == NULL) + throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); + + if (!GDKstrimp_makehistogram(b, hist, STRIMP_HISTSIZE, )) { + throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) OPERATION_FAILED); + } + + ob = COLnew(0, TYPE_lng, STRIMP_HISTSIZE, TRANSIENT); + if (ob == NULL) { + throw(MAL, "strimp.makehist", SQLSTATE(HY013) MAL_MALLOC_FAIL); + } + + for (i=0; i < STRIMP_HISTSIZE; i++) { + if (BUNappend(ob, hist + i, false) != GDK_SUCCEED) + throw(MAL, "strimp.makehist", SQLSTATE(HY013) MAL_MALLOC_FAIL); + } + + *getArgReference_bat(stk, 
pci, 0) = count; + // *getArgReference_bat(stk, pci, 1) = ob->batCacheid; + + // BBPkeepref(ob->batCacheid); + return MAL_SUCCEED; +} + +/* + * String imprints dev/testing. TODO: end remove. + */ + + #include "mel.h" mel_func batExtensions_init_funcs[] = { pattern("bat", "new", CMDBATnew, false, "", args(1,2, batargany("",1),argany("tt",1))), @@ -365,6 +434,17 @@ mel_func batExtensions_init_funcs[] = { #endif pattern("bat", "appendBulk", CMDBATappend_bulk, false, "append the arguments ins to i", args(1,4, batargany("",1), batargany("i",1),arg("force",bit),varargany("ins",1))), pattern("bat", "appendBulk", CMDBATappend_bulk, false, "append the arguments ins to i", args(1,4, batargany("",1), batargany("i",1),arg("force",bit),batvarargany("ins",1))), + + /* + * String imprints dev/testing. TODO: remove. + */ + pattern("bat", "count_digrams", CMDstrimp_ndigrams, false, "count digrams in a string bat", args(1, 2, arg("",lng), batarg("b", str))), + //pattern("batcalc", "make_histogam", CMDstrimp_makehist, false, "make a histogram of all the byte pairs in a BAT", args(2, 3, arg("", sht), batarg("", lng), batarg("b", str))), + pattern("bat", "make_histogam", CMDstrimp_makehist, false, "make a histogram of all the byte pairs in a BAT", args(1, 2, arg("", sht), batarg("b", str))), + /* + * String imprints dev/testing. TODO: end remove. + */ + { .imp=NULL } }; #include "mal_import.h" diff --git a/monetdb5/modules/mal/batcalc.c b/monetdb5/modules/mal/batcalc.c --- a/monetdb5/modules/mal/batcalc.c +++ b/monetdb5/modules/mal/batcalc.c @@ -1368,39 +1368,6 @@ CMDifthen(Client cntxt, MalBlkPtr mb, Ma return MAL_SUCCEED; } - -/* - * String imprints dev/testing. TODO: remove. 
- */ -static str -CMDstrimp_ndigrams(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) -{ - bat bid; - BAT *b; - size_t n; - - (void)cntxt; - (void)mb; - - // return mythrow(MAL, "batcalc.striter", OPERATION_FAILED); - bid = *getArgReference_bat(stk, pci, 1); - if ((b = BATdescriptor(bid)) == NULL) - throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); - - if (!GDKstrimp_ndigrams(b, )) { - throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) OPERATION_FAILED); - } - - *getArgReference_lng(stk, pci, 0) = n; - - return MAL_SUCCEED; -} - - -/* - * String imprints dev/testing. TODO: end remove. - */ - #include "mel.h" static str @@ -2221,14 +2188,6 @@ static mel_func batcalc_init_funcs[] = { pattern("batcalc", "ifthenelse", CMDifthen, false, "If-then-else operation to assemble a conditional result",
MonetDB: string_imprints - Return the histogram itself along with the non-zero bin count
Changeset: 31582eece4b6 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=31582eece4b6 Modified Files: gdk/gdk_strimps.c gdk/gdk_strimps.h monetdb5/modules/mal/batExtensions.c Branch: string_imprints Log Message: Return the histogram itself along with the non-zero bin count diffs (58 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -85,7 +85,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) #define pairToIndex(b1, b2) (b1)<<8 | (b2) gdk_return -GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, uint16_t *count) +GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *count) { size_t hi; BUN i; diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h --- a/gdk/gdk_strimps.h +++ b/gdk/gdk_strimps.h @@ -17,6 +17,6 @@ #define STRIMP_HISTSIZE 256*256 gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n); -gdk_export gdk_return GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, uint16_t *count); +gdk_export gdk_return GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, size_t *count); #endif /* _GDK_STRIMPS_H_ */ diff --git a/monetdb5/modules/mal/batExtensions.c b/monetdb5/modules/mal/batExtensions.c --- a/monetdb5/modules/mal/batExtensions.c +++ b/monetdb5/modules/mal/batExtensions.c @@ -371,7 +371,7 @@ CMDstrimp_makehist(Client cntxt, MalBlkP BAT *b, *ob; size_t i; uint64_t hist[STRIMP_HISTSIZE]; - uint16_t count; + size_t count; (void)cntxt; (void)mb; @@ -394,10 +394,10 @@ CMDstrimp_makehist(Client cntxt, MalBlkP throw(MAL, "bat.strimpHistogram", SQLSTATE(HY013) MAL_MALLOC_FAIL); } - *getArgReference_bat(stk, pci, 0) = count; - // *getArgReference_bat(stk, pci, 1) = ob->batCacheid; + *getArgReference_lng(stk, pci, 0) = count; + *getArgReference_bat(stk, pci, 1) = ob->batCacheid; - // BBPkeepref(ob->batCacheid); + BBPkeepref(ob->batCacheid); return MAL_SUCCEED; } @@ -433,7 +433,7 @@ mel_func batExtensions_init_funcs[] = { /* String 
imprints */ pattern("bat", "strimpNDigrams", CMDstrimp_ndigrams, false, "count digrams in a string bat", args(1,2,arg("",lng),batarg("b",str))), - pattern("bat", "strimpHistogam", CMDstrimp_makehist, false, "make a histogram of all the byte pairs in a BAT", args(1,2,arg("",lng), batarg("b",str))), + pattern("bat", "strimpHistogram", CMDstrimp_makehist, false, "make a histogram of all the byte pairs in a BAT", args(2,3,arg("",lng), batarg("",lng),batarg("b",str))), //pattern("batcalc", "make_histogam", CMDstrimp_makehist, false, "make a histogram of all the byte pairs in a BAT", args(2, 3, arg("", sht), batarg("", lng), batarg("b", str))), { .imp=NULL } }; ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Count utf-8 chars correctly
Changeset: f0e19e88af26 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f0e19e88af26 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Count utf-8 chars correctly diffs (62 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -14,9 +14,31 @@ * contains. */ static size_t -GDKstrimp_strlen(const char *s) +GDKstrimp_strlen(const uint8_t *s) { - return strlen(s); + size_t ret = 0; + size_t i; + int m,n; + uint8_t c; + + i = 0; + while((c = *(s + i)) != 0) { + if (c < 0x80) + i++; + else { + for (n = 0, m=0x40; c & m; n++, m >>= 1) + ; + /* n is now the number of 10xx bytes that should + follow. */ + if (n == 0 || n >= 4) + /* TODO: handle invalid utf-8 */ + {} + i += n+1; + } + ret++; + } + + return ret; } /* Given a BAT return the number of digrams in it. The observation is @@ -33,7 +55,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) // lng t0; BUN i; BATiter bi; - char *s; + uint8_t *s; // GDKtracer_set_component_level("ALGO", "DEBUG"); // struct canditer ci; @@ -44,12 +66,13 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) bi = bat_iterator(b); *n = 0; for (i = 0; i < b->batCount; i++) { - s = (char *)BUNtail(bi, i); + s = (uint8_t *)BUNtail(bi, i); *n += GDKstrimp_strlen(s) - 1; - // TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, (char *)BUNtail(bi, i)); + // TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, s); } // TRC_DEBUG(ALGO, LLFMT "usec\n", GDKusec() - t0); + // GDKtracer_flush_buffer(); return GDK_SUCCEED; } ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Rename functions for consistency
Changeset: f686c3ba196f for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f686c3ba196f Modified Files: monetdb5/modules/mal/batExtensions.c Branch: string_imprints Log Message: Rename functions for consistency diffs (81 lines): diff --git a/monetdb5/modules/mal/batExtensions.c b/monetdb5/modules/mal/batExtensions.c --- a/monetdb5/modules/mal/batExtensions.c +++ b/monetdb5/modules/mal/batExtensions.c @@ -338,7 +338,7 @@ CMDBATappend_bulk(Client cntxt, MalBlkPt } /* - * String imprints dev/testing. TODO: remove. + * String imprints. */ static str CMDstrimp_ndigrams(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) @@ -353,10 +353,10 @@ CMDstrimp_ndigrams(Client cntxt, MalBlkP // return mythrow(MAL, "batcalc.striter", OPERATION_FAILED); bid = *getArgReference_bat(stk, pci, 1); if ((b = BATdescriptor(bid)) == NULL) - throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); + throw(MAL, "bat.strimpDigrams", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); if (!GDKstrimp_ndigrams(b, )) { - throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) OPERATION_FAILED); + throw(MAL, "bat.strimpDigrams", SQLSTATE(HY002) OPERATION_FAILED); } *getArgReference_lng(stk, pci, 0) = n; @@ -378,20 +378,20 @@ CMDstrimp_makehist(Client cntxt, MalBlkP bid = *getArgReference_bat(stk, pci, 2); if ((b = BATdescriptor(bid)) == NULL) - throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); + throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); if (!GDKstrimp_makehistogram(b, hist, STRIMP_HISTSIZE, )) { - throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) OPERATION_FAILED); + throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) OPERATION_FAILED); } ob = COLnew(0, TYPE_lng, STRIMP_HISTSIZE, TRANSIENT); if (ob == NULL) { - throw(MAL, "strimp.makehist", SQLSTATE(HY013) MAL_MALLOC_FAIL); + throw(MAL, "bat.strimpHistogram", SQLSTATE(HY013) MAL_MALLOC_FAIL); } for (i=0; i < STRIMP_HISTSIZE; i++) { if (BUNappend(ob, hist + i, false) != 
GDK_SUCCEED) - throw(MAL, "strimp.makehist", SQLSTATE(HY013) MAL_MALLOC_FAIL); + throw(MAL, "bat.strimpHistogram", SQLSTATE(HY013) MAL_MALLOC_FAIL); } *getArgReference_bat(stk, pci, 0) = count; @@ -401,10 +401,6 @@ CMDstrimp_makehist(Client cntxt, MalBlkP return MAL_SUCCEED; } -/* - * String imprints dev/testing. TODO: end remove. - */ - #include "mel.h" mel_func batExtensions_init_funcs[] = { @@ -435,16 +431,10 @@ mel_func batExtensions_init_funcs[] = { pattern("bat", "appendBulk", CMDBATappend_bulk, false, "append the arguments ins to i", args(1,4, batargany("",1), batargany("i",1),arg("force",bit),varargany("ins",1))), pattern("bat", "appendBulk", CMDBATappend_bulk, false, "append the arguments ins to i", args(1,4, batargany("",1), batargany("i",1),arg("force",bit),batvarargany("ins",1))), - /* - * String imprints dev/testing. TODO: remove. - */ - pattern("bat", "count_digrams", CMDstrimp_ndigrams, false, "count digrams in a string bat", args(1, 2, arg("",lng), batarg("b", str))), + /* String imprints */ + pattern("bat", "strimpNDigrams", CMDstrimp_ndigrams, false, "count digrams in a string bat", args(1,2,arg("",lng),batarg("b",str))), + pattern("bat", "strimpHistogam", CMDstrimp_makehist, false, "make a histogram of all the byte pairs in a BAT", args(1,2,arg("",lng), batarg("b",str))), //pattern("batcalc", "make_histogam", CMDstrimp_makehist, false, "make a histogram of all the byte pairs in a BAT", args(2, 3, arg("", sht), batarg("", lng), batarg("b", str))), - pattern("bat", "make_histogam", CMDstrimp_makehist, false, "make a histogram of all the byte pairs in a BAT", args(1, 2, arg("", sht), batarg("b", str))), - /* - * String imprints dev/testing. TODO: end remove. - */ - { .imp=NULL } }; #include "mal_import.h" ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Byte pair histogram construction
Changeset: 4f3cbb1ef6c7 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4f3cbb1ef6c7 Modified Files: gdk/gdk_strimps.c gdk/gdk_strimps.h Branch: string_imprints Log Message: Byte pair histogram construction Count the occurrences of pairs of bytes. This is different than counting pairs of characters, unless the characters are ASCII. diffs (71 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -76,3 +76,41 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) return GDK_SUCCEED; } + +/* The isIgnored is a bit suspect in terms of unicode. There are + * non-ASCII codepoints that are considered spaces, for example the + * codepoints in the range U+2000-U+200f. + */ +#define isIgnored(x) isspace((x)) || isdigit((x)) +#define pairToIndex(b1, b2) (b1)<<8 | (b2) + +gdk_return +GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, uint16_t *count) +{ + size_t hi; + BUN i; + BATiter bi; + uint8_t *ptr, *s; + assert(b->ttype == TYPE_str); + + for(hi = 0; hi < hist_size; hi++) + hist[hi] = 0; + + bi = bat_iterator(b); + *count = 0; + for(i = 0; i < b->batCount; i++) { + s = (uint8_t *)BUNtail(bi, i); + for(ptr = s; *(ptr + 1) != 0; ptr++) { + if (isIgnored(*ptr)) /* skip the current pair and the next at the end of the loop */ + ptr++; + else { + hi = pairToIndex(*(ptr), *(ptr+1)); + assert(hi < hist_size); + if (hist[hi] == 0) + (*count)++; + hist[hi]++; + } + } + } + return GDK_SUCCEED; +} diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h --- a/gdk/gdk_strimps.h +++ b/gdk/gdk_strimps.h @@ -11,17 +11,12 @@ #include -#define HISTSIZE 64 - -typedef struct { - uint64_t counts[HISTSIZE]; - char foo; -} Histogram; - -typedef struct { - Histogram* hist; -} Strimp; +/* Count the occurences of pairs of bytes. This is a compromise between + * just handling ASCII and full UTF-8 support. 
+ */ +#define STRIMP_HISTSIZE 256*256 gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n); +gdk_export gdk_return GDKstrimp_makehistogram(BAT *b, uint64_t *hist, size_t hist_size, uint16_t *count); #endif /* _GDK_STRIMPS_H_ */ ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Merge with default
Changeset: 0ce20141e77a for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/0ce20141e77a Modified Files: gdk/CMakeLists.txt gdk/gdk.h gdk/gdk_private.h gdk/gdk_strimps.c monetdb5/modules/mal/01_calc.mal monetdb5/modules/mal/batExtensions.c monetdb5/modules/mal/batcalc.c Branch: string_imprints Log Message: Merge with default diffs (truncated from 418839 to 300 lines): diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -11,9 +11,9 @@ assignees: '' A clear and concise description of what the bug is. **To Reproduce** -Create a setting with minimal input for an external user to demonstrate him the buggy behavior. -This includes the relevant part of the database schema description. -Performance trace of the roque query (using the TRACE command) +Create a setting with minimal input for an external user to demonstrate the buggy behavior. +This includes the relevant part of the database schema description. +Performance trace of the rogue query (using the TRACE command) **Expected behavior** A clear and concise description of what you expected to happen. 
diff --git a/.hgtags b/.hgtags --- a/.hgtags +++ b/.hgtags @@ -798,3 +798,5 @@ 929f5e280bc1532a2bfaab127ca7915dc3b69a33 742b7847cfdcea39a6c19ab29eb35471d46bb2bb Oct2020_SP2_release 17d27ad30941c81e4bc700300912e84e9b9a8c37 Oct2020_13 17d27ad30941c81e4bc700300912e84e9b9a8c37 Oct2020_SP3_release +6b71a8cc3498561815ac88d6c652922359efd13a Oct2020_15 +6b71a8cc3498561815ac88d6c652922359efd13a Oct2020_SP4_release diff --git a/MonetDB.spec b/MonetDB.spec --- a/MonetDB.spec +++ b/MonetDB.spec @@ -84,7 +84,7 @@ Group: Applications/Databases License: MPLv2.0 URL: https://www.monetdb.org/ BugURL: https://bugs.monetdb.org/ -Source: https://www.monetdb.org/downloads/sources/Oct2020-SP3/%{name}-%{version}.tar.bz2 +Source: https://www.monetdb.org/downloads/sources/Oct2020-SP4/%{name}-%{version}.tar.bz2 # The Fedora packaging document says we need systemd-rpm-macros for # the _unitdir and _tmpfilesdir macros to exist; however on RHEL 7 @@ -302,8 +302,8 @@ This package contains the files needed t Summary: MonetDB ODBC driver Group: Applications/Databases Requires: %{name}-client%{?_isa} = %{version}-%{release} -Requires(post): unixODBC -Requires(postun): unixODBC +Requires(post): %{_bindir}/odbcinst +Requires(postun): %{_bindir}/odbcinst %description client-odbc MonetDB is a database management system that is developed from a @@ -828,6 +828,48 @@ else fi %changelog +* Fri Apr 02 2021 Sjoerd Mullender - 11.39.15-20210402 +- Rebuilt. 
+- GH#6786: function json.isvalid(js json) is not useful, could be removed +- GH#7016: Database crashes when use similarity function on a table with + more than 200k records +- GH#7037: Clearer err msg for ALTER USER with insufficient privileges +- GH#7042: AddressSanitizer:DEADLYSIGNAL in Oct2020/gdk/gdk_tracer.c:494 +- GH#7050: file descriptor leak when forward=redirect +- GH#7057: ODBC driver installer on Windows is missing some DLLs +- GH#7058: MonetDBe: COPY INTO csv file does not produce any output +- GH#7059: MonetDBe: 'reverse' C UDF crashes +- GH#7061: Have bulk load support combined gzip files +- GH#7064: Temporary hashes created in hash and unique logic should try to + use transient data farm first +- GH#7066: percent_rank function with wrong results +- GH#7070: double free error when running MonetDBe Example +- GH#7076: mserver5 ignores memory.low from cgroups v2 +- GH#7077: Oct2020: new default privileges not effectively communicated +- GH#7083: MonetDBe C++ Compiling Error +- GH#7085: Mitosis and filter functions +- GH#7087: SIGSEGV caused by error in subquery's function being ignored by + top-level query +- GH#7089: Data consistency problem of query results in the latest release + of Monetdb (Remote Table) + +* Wed Mar 31 2021 Sjoerd Mullender - 11.39.15-20210402 +- odbc: When connecting using a DSN (Data Source Name), information about the + data source is retrieved from the ODBC.INI file. Now we also get the + location of the LOGFILE from this file. The logfile can be used to + log all calls to the MonetDB ODBC driver to a file which can be used + for debugging. + +* Thu Mar 25 2021 Sjoerd Mullender - 11.39.15-20210402 +- odbc: The ODBC driver now only passes on information about HUGEINT columns + as HUGEINT when the application has indicated interest by querying + about the SQL_HUGEINT extension type using the SQLGetTypeInfo + function or by specifying the type in a call to SQLSetDescField. 
+ Otherwise the driver silently translates the HUGEINT type to BIGINT. + This means that most application will see BIGINT columns when the + server produced a HUGEINT column and only give an error if the value + in the HUGEINT column didn't fit into a BIGINT. + * Thu Feb 11 2021 Sjoerd Mullender -
MonetDB: string_imprints - Fix header construction
Changeset: e332f5015f9c for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/e332f5015f9c Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Fix header construction diffs (38 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -136,9 +136,9 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) */ #define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x))) #define isNotIgnored(x) (!isIgnored(x)) -#define pairToIndex(b1, b2) (DataPair)(((uint8_t)b1)<<8 | ((uint8_t)b2)) -#define indexToPair1(idx) (idx & 0xff00) >> 8 -#define indexToPair2(idx) (idx & 0xff) +#define pairToIndex(b1, b2) (DataPair)(((uint8_t)b2)<<8 | ((uint8_t)b1)) +#define indexToPair2(idx) (idx & 0xff00) >> 8 +#define indexToPair1(idx) (idx & 0xff) #define swp(_a, _i, _j, TPE) \ do {\ TPE _t = ((TPE *)_a)[_i]; \ @@ -330,9 +330,9 @@ create_strimp_heap(BAT *b, StrimpHeader Heap *r = NULL; uint64_t *d; uint64_t descriptor; - uint8_t npairs, bytes_per_pair; - uint16_t hsize; - size_t i,j; + uint64_t npairs, bytes_per_pair, hsize; + size_t i; + int j; const char *nme; nme = GDKinmemory(b->theap->farmid) ? ":memory:" : BBP_physical(b->batCacheid); @@ -362,7 +362,7 @@ create_strimp_heap(BAT *b, StrimpHeader */ for(i = 0; i < STRIMP_HEADER_SIZE; i += 4) { *d = 0; - for(j = 0; j < 4; j++) { + for(j = 3; j >= 0; j--) { *d <<= 16; *d |= h->bytepairs[i + j]; } ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: default - Add lock github bot configuration
Changeset: 6fc05424ec63 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/6fc05424ec63 Added Files: .github/lock.yml Branch: default Log Message: Add lock github bot configuration diffs (43 lines): diff --git a/.github/lock.yml b/.github/lock.yml new file mode 100644 --- /dev/null +++ b/.github/lock.yml @@ -0,0 +1,38 @@ +# Configuration for Lock Threads - https://github.com/dessant/lock-threads-app + +# Number of days of inactivity before a closed issue or pull request is locked +daysUntilLock: 20 + +# Skip issues and pull requests created before a given timestamp. Timestamp must +# follow ISO 8601 (`YYYY-MM-DD`). Set to `false` to disable +skipCreatedBefore: false + +# Issues and pull requests with these labels will be ignored. Set to `[]` to disable +exemptLabels: [] + +# Label to add before locking, such as `outdated`. Set to `false` to disable +lockLabel: false + +# Comment to post before locking. Set to `false` to disable +lockComment: > + This thread has been automatically locked since there has not been + any recent activity after it was closed. Please open a new issue for + related bugs. + +# Assign `resolved` as the reason for locking. Set to `false` to disable +setLockReason: false + +# Limit to only `issues` or `pulls` +# only: issues + +# Optionally, specify configuration settings just for `issues` or `pulls` +# issues: +# exemptLabels: +# - help-wanted +# lockLabel: outdated + +# pulls: +# daysUntilLock: 30 + +# Repository to extend settings from +# _extends: repo ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Do not try to generate strimp if it's already constructed
Changeset: 76f731444d7c for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/76f731444d7c Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Do not try to generate strimp if it's already constructed diffs (65 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -392,35 +392,37 @@ GDKstrimp_create_strimp(BAT *b) assert(b->ttype == TYPE_str); TRC_DEBUG_IF(ALGO) t0 = GDKusec(); - if ((head = create_header(b)) == NULL) { - return GDK_FAIL; - } + if (b->tstrimps == NULL) { + if ((head = create_header(b)) == NULL) { + return GDK_FAIL; + } - if ((h = create_strimp_heap(b, head)) == NULL) { - GDKfree(head); - return GDK_FAIL; - } - dh = (uint64_t *)h->base + h->free; + if ((h = create_strimp_heap(b, head)) == NULL) { + GDKfree(head); + return GDK_FAIL; + } + dh = (uint64_t *)h->base + h->free; - bi = bat_iterator(b); - for (i = 0; i < b->batCount; i++) { - s = (str)BUNtvar(bi, i); - if (!strNil(s)) - *dh++ = GDKstrimp_make_bitstring(s, head); - else - *dh++ = 0; /* no pairs in nil values */ + bi = bat_iterator(b); + for (i = 0; i < b->batCount; i++) { + s = (str)BUNtvar(bi, i); + if (!strNil(s)) + *dh++ = GDKstrimp_make_bitstring(s, head); + else + *dh++ = 0; /* no pairs in nil values */ + } + + /* After we have computed the strimp, attempt to write it back +* to the BAT. +*/ + MT_lock_set(&b->batIdxLock); + b->tstrimps = h; + b->batDirtydesc = true; + /* persistStrimp(b) */ + MT_lock_unset(&b->batIdxLock); } - /* After we have computed the strimp, attempt to write it back -* to the BAT. -*/ - MT_lock_set(&b->batIdxLock); - b->tstrimps = h; - b->batDirtydesc = true; - /* persistStrimp(b) */ - MT_lock_unset(&b->batIdxLock); - TRC_DEBUG(ALGO, "strimp creation took " LLFMT " usec\n", GDKusec()-t0); return GDK_SUCCEED; } ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: default - Show more information if Mz.py is not able to start the server
Changeset: 8088c952798c for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/8088c952798c Modified Files: testing/Mz.py.in Branch: default Log Message: Show more information if Mz.py is not able to start the server diffs (14 lines): diff --git a/testing/Mz.py.in b/testing/Mz.py.in --- a/testing/Mz.py.in +++ b/testing/Mz.py.in @@ -900,8 +900,8 @@ def GetBitsAndModsAndThreads(env) : if proc.returncode is None: killProc(proc, proc.stderr, cmd) proc.wait() -if procdebug: -print('GetBitsAndModsAndThreads: process exited "%s" (%s)\n' % ('" "'.join(cmd), proc.returncode)) +if procdebug or proc.returncode != 0: +print('GetBitsAndModsAndThreads: process exited "%s" (%s)\n' % ('" "'.join(cmd), proc.returncode), file=sys.stderr) env['TST_MODS'] = [] env['TST_BITS'] = "" env['TST_INT128'] = "" ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Create string imprints branch
Changeset: 6e9b8a1f0fc8 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6e9b8a1f0fc8 Branch: string_imprints Log Message: Create string imprints branch ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: default - Fix JSON parsing of exponents
Changeset: 11b6539611d7 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=11b6539611d7 Modified Files: monetdb5/modules/atoms/json.c Branch: default Log Message: Fix JSON parsing of exponents + is an acceptable char in an exponent, but a digit needs to be present. diffs (35 lines): diff --git a/monetdb5/modules/atoms/json.c b/monetdb5/modules/atoms/json.c --- a/monetdb5/modules/atoms/json.c +++ b/monetdb5/modules/atoms/json.c @@ -918,17 +918,29 @@ JSONfractionParser(const char *j, const static bool JSONexponentParser(const char *j, const char **next) { + const char *s = j; + bool saw_digit = false; + if (*j != 'e' && *j != 'E') { return false; } j++; - if (*j == '-') + if (*j == '-' || *j == '+') j++; - for (; *j; j++) + for (; *j; j++) { if (!isdigit((unsigned char)*j)) break; + saw_digit = true; + } + + + if (!saw_digit) { + j = s; + return false; + } + *next = j; ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: default - Make json.isvalid a no-op for json inputs
Changeset: c02fd4fc3853 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c02fd4fc3853 Modified Files: monetdb5/modules/atoms/json.c monetdb5/modules/atoms/json.mal sql/scripts/40_json.sql Branch: default Log Message: Make json.isvalid a no-op for json inputs Also remove some other unneeded functions. This fixes #6786 diffs (101 lines): diff --git a/monetdb5/modules/atoms/json.c b/monetdb5/modules/atoms/json.c --- a/monetdb5/modules/atoms/json.c +++ b/monetdb5/modules/atoms/json.c @@ -400,7 +400,7 @@ JSONstr2json(json *ret, str *j) } static str -JSONisvalid(bit *ret, json *j) +JSONisvalid(bit *ret, str *j) { if (strNil(*j)) { *ret = bit_nil; @@ -2708,7 +2708,6 @@ static mel_atom json_init_atoms[] = { static mel_func json_init_funcs[] = { command("json", "new", JSONstr2json, false, "Convert string to its JSON. Dealing with escape characters", args(1,2, arg("",json),arg("j",str))), command("calc", "json", JSONstr2json, false, "Convert string to its JSON. Dealing with escape characters", args(1,2, arg("",json),arg("j",str))), - command("calc", "json", JSONstr2json, false, "Convert JSON to JSON. Dealing with escape characters", args(1,2, arg("",json),arg("j",json))), command("json", "str", JSONjson2str, false, "Convert JSON to its string equivalent. 
Dealing with escape characters", args(1,2, arg("",str),arg("j",json))), command("json", "text", JSONjson2text, false, "Convert JSON values to their plain string equivalent.", args(1,2, arg("",str),arg("j",json))), command("json", "text", JSONjson2textSeparator, false, "Convert JSON values to their plain string equivalent, injecting a separator.", args(1,3, arg("",str),arg("j",json),arg("s",str))), @@ -2728,12 +2727,9 @@ static mel_func json_init_funcs[] = { command("json", "filter", JSONfilterArray_hge, false, "", args(1,3, arg("",json),arg("name",json),arg("idx",hge))), command("json", "filter", JSONfilterArrayDefault_hge, false, "Extract a single array element", args(1,4, arg("",json),arg("name",json),arg("idx",hge),arg("other",str))), #endif - command("json", "isvalid", JSONisvalid, false, "Validate the string as a valid JSON document", args(1,2, arg("",bit),arg("val",json))), command("json", "isobject", JSONisobject, false, "Validate the string as a valid JSON object", args(1,2, arg("",bit),arg("val",json))), command("json", "isarray", JSONisarray, false, "Validate the string as a valid JSON array", args(1,2, arg("",bit),arg("val",json))), command("json", "isvalid", JSONisvalid, false, "Validate the string as a valid JSON document", args(1,2, arg("",bit),arg("val",str))), - command("json", "isobject", JSONisobject, false, "Validate the string as a valid JSON object", args(1,2, arg("",bit),arg("val",str))), - command("json", "isarray", JSONisarray, false, "Validate the string as a valid JSON array", args(1,2, arg("",bit),arg("val",str))), command("json", "length", JSONlength, false, "Returns the number of elements in the outermost JSON object.", args(1,2, arg("",int),arg("val",json))), pattern("json", "unfold", JSONunfold, false, "Expands the outermost JSON object into key-value pairs.", args(2,3, batarg("k",str),batarg("v",json),arg("val",json))), pattern("json", "unfold", JSONunfold, false, "Expands the outermost JSON object into key-value pairs.", args(3,4, 
batarg("o",oid),batarg("k",str),batarg("v",json),arg("val",json))), diff --git a/monetdb5/modules/atoms/json.mal b/monetdb5/modules/atoms/json.mal --- a/monetdb5/modules/atoms/json.mal +++ b/monetdb5/modules/atoms/json.mal @@ -22,10 +22,6 @@ command calc.json(j:str):json address JSONstr2json comment "Convert string to its JSON. Dealing with escape characters"; -command calc.json(j:json):json -address JSONstr2json -comment "Convert JSON to JSON. Dealing with escape characters"; - command str(j:json):str address JSONjson2str comment "Convert JSON to its string equivalent. Dealing with escape characters"; @@ -72,10 +68,6 @@ command filter(name:json, idx:lng, other address JSONfilterArrayDefault_lng comment "Extract a single array element"; -command isvalid(val:json):bit -address JSONisvalid -comment "Validate the string as a valid JSON document"; - command isobject(val:json):bit address JSONisobject comment "Validate the string as a valid JSON object"; @@ -88,14 +80,6 @@ command isvalid(val:str):bit address JSONisvalid comment "Validate the string as a valid JSON document"; -command isobject(val:str):bit -address JSONisobject -comment "Validate the string as a valid JSON object"; - -command isarray(val:str):bit -address JSONisarray -comment "Validate the string as a valid JSON array"; - command length(val:json):int address JSONlength comment "Returns the number of elements in the outermost JSON object."; diff --git a/sql/scripts/40_json.sql b/sql/scripts/40_json.sql --- a/sql/scripts/40_json.sql +++ b/sql/scripts/40_json.sql @@ -40,22 +40,13 @@ create
MonetDB: default - Ugly hack to fix monetdbe compilation for doc...
Changeset: 119fcfed7edd for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=119fcfed7edd Modified Files: common/utils/mcrypt.c Branch: default Log Message: Ugly hack to fix monetdbe compilation for docker pipeline diffs (22 lines): diff --git a/common/utils/mcrypt.c b/common/utils/mcrypt.c --- a/common/utils/mcrypt.c +++ b/common/utils/mcrypt.c @@ -45,6 +45,18 @@ mcrypt_getHashAlgorithms(void) * desire. */ static const char *algorithms = + /* When compiling MonetDBe for docker, we use -DWITH_CRYPTO=OFF. This means that none of the hashing algorithms +* are available and so we get a syntax error at mcrypt_getHashAlgorithms. +* +* This used to compile because it +* unconditionally included PROT10. + +* This hack is dangerous because it will allow MonetDB server to be built even without openssl installed. A +* sever like that will be incompatible with all clients because it does not implement MAPI correctly. Ideally +* we should solve this at CMake level but it is difficult because the common modules between MonetDBe and +* MonetDB server require substantially different compilation parameters. +*/ + "INVALID" #ifdef HAVE_RIPEMD160_UPDATE ",RIPEMD160" #endif ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Close the iterators
Changeset: 34549818041a for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/34549818041a Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Close the iterators diffs (68 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -306,6 +306,7 @@ STRMPbuildHeader(BAT *b, CharPair *hpair } } } + bat_iterator_end(&bi); // Choose the header pairs STRMPchoosePairs(hist, hlen, hpairs); @@ -496,6 +497,7 @@ STRMPfilter(BAT *b, char *q) return NULL; } +#if 0 static void BATstrimpsync(void *arg) { @@ -552,7 +554,6 @@ BATstrimpsync(void *arg) static void persistStrimp(BAT *b) { - TRC_DEBUG(ACCELERATOR, "zoo: %d\n", (BBP_status(b->batCacheid) & BBPEXISTING)); if((BBP_status(b->batCacheid) & BBPEXISTING) && b->batInserted == b->batCount && !b->theap->dirty @@ -567,6 +568,7 @@ persistStrimp(BAT *b) } else TRC_DEBUG(ACCELERATOR, "persistStrimp(" ALGOBATFMT "): NOT persisting strimp\n", ALGOBATPAR(b)); } +#endif /* Create */ gdk_return @@ -587,9 +589,9 @@ STRMPcreate(BAT *b) return GDK_SUCCEED; /* Disable this before merging to default */ - if (isVIEW(b)) { + if (VIEWtparent(b)) { assert(b->tstrimps == NULL); - b = BBPdescriptor(VIEWtparent(b)); + b = BBP_cache(VIEWtparent(b)); } if ((h = STRMPcreateStrimpHeap(b)) == NULL) { @@ -605,6 +607,7 @@ STRMPcreate(BAT *b) else *dh++ = 0; /* no pairs in nil values */ } + bat_iterator_end(&bi); h->strimps.free += b->batCount*sizeof(uint64_t); @@ -621,9 +624,9 @@ STRMPcreate(BAT *b) /* After we have computed the strimp, attempt to write it back * to the BAT. */ - MT_lock_set(&b->batIdxLock); - persistStrimp(b); - MT_lock_unset(&b->batIdxLock); + /* MT_lock_set(&b->batIdxLock); */ + /* persistStrimp(b); */ + /* MT_lock_unset(&b->batIdxLock); */ TRC_DEBUG(ACCELERATOR, "strimp creation took " LLFMT " usec\n", GDKusec()-t0); return GDK_SUCCEED; ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Fix strimp creation and filtering
Changeset: 7d48a7a8c479 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/7d48a7a8c479 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Fix strimp creation and filtering diffs (161 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -339,7 +339,7 @@ STRMPcreateStrimpHeap(BAT *b) /* Make sure no other thread got here first */ if (b->tstrimps == NULL) { STRMPbuildHeader(b, hpairs); /* Find the header pairs */ - sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the descriptor */ + sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the descriptor and the pair sizes */ for (i = 0; i < STRIMP_HEADER_SIZE; i++) { sz += hpairs[i].psize; } @@ -465,31 +465,42 @@ STRMPfilter(BAT *b, char *q) BUN i; uint64_t qbmask; uint64_t *ptr; + Strimps *strmps; - if (b->tstrimps == NULL) - goto sfilter_fail; + if (isVIEW(b)) { + // b = BBP_cache(VIEWtparent(b)); + BAT *pb = BBP_cache(VIEWtparent(b)); + if (!BATcheckstrimps(pb)) + goto sfilter_fail; + strmps = pb->tstrimps; + } + else { + if (!BATcheckstrimps(b)) + goto sfilter_fail; + strmps = b->tstrimps; + } r = COLnew(b->hseqbase, TYPE_oid, b->batCount, TRANSIENT); if (r == NULL) { goto sfilter_fail; } - if (!BATcheckstrimps(b)) { - BBPunfix(r->batCacheid); - goto sfilter_fail; - } - qbmask = STRMPmakebitstring(q, b->tstrimps); - ptr = (uint64_t *)b->tstrimps->strimps_base; + qbmask = STRMPmakebitstring(q, strmps); + ptr = (uint64_t *)strmps->strimps_base; for (i = 0; i < b->batCount; i++) { if ((*(ptr + i) & qbmask) == qbmask) { - oid pos = i; + oid pos = i + b->hseqbase; if (BUNappend(r, , false) != GDK_SUCCEED) goto sfilter_fail; } } r->tkey = true; + r->tsorted = true; + r->trevsorted = BATcount(r) <= 1; + r->tnil = false; + r->tnonil = true; return virtualize(r); @@ -497,7 +508,6 @@ STRMPfilter(BAT *b, char *q) return NULL; } -#if 0 static void BATstrimpsync(void *arg) { @@ -568,7 +578,8 @@ persistStrimp(BAT *b) } else TRC_DEBUG(ACCELERATOR, 
"persistStrimp(" ALGOBATFMT "): NOT persisting strimp\n", ALGOBATPAR(b)); } -#endif + +static ATOMIC_TYPE STRMPnthread = ATOMIC_VAR_INIT(0); /* Create */ gdk_return @@ -580,27 +591,33 @@ STRMPcreate(BAT *b) str s; Strimps *h; uint64_t *dh; + BAT *pb; - assert(b->ttype == TYPE_str); TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec(); - + if (b->ttype != TYPE_str) { + GDKerror("strimps only valid for strings\n"); + return GDK_FAIL; + } - if (BATcheckstrimps(b)) + (void)ATOMIC_INC(); + /* Disable this before merging to default */ +if (VIEWtparent(b)) { + pb = BBP_cache(VIEWtparent(b)); + assert(pb); + } else { + pb = b; + } + + if (BATcheckstrimps(pb)) return GDK_SUCCEED; - /* Disable this before merging to default */ - if (VIEWtparent(b)) { - assert(b->tstrimps == NULL); - b = BBP_cache(VIEWtparent(b)); - } - - if ((h = STRMPcreateStrimpHeap(b)) == NULL) { +if ((h = STRMPcreateStrimpHeap(pb)) == NULL) { return GDK_FAIL; } - dh = (uint64_t *)((uint8_t*)h->strimps.base + h->strimps.free); + dh = (uint64_t *)((uint8_t*)h->strimps.base + h->strimps.free + b->hseqbase*8); bi = bat_iterator(b); - for (i = 0; i < b->batCount; i++) { + for (i = 0; i < bi.count; i++) { s = (str)BUNtvar(bi, i); if (!strNil(s)) *dh++ = STRMPmakebitstring(s, h); @@ -608,25 +625,16 @@ STRMPcreate(BAT *b) *dh++ = 0; /* no pairs in nil values */ } bat_iterator_end(); + + MT_lock_set(>batIdxLock); h->strimps.free += b->batCount*sizeof(uint64_t); - + MT_lock_unset(>batIdxLock); -#ifndef NDEBUG - { - FILE *f = fopen("/tmp/strmp", "wb"); - if (f) { - fwrite(h->strimps.base, 1, h->strimps.free, f); - fclose(f); - } + /* The thread that reaches this point last needs to write the strimp to disk. */ + (void)ATOMIC_DEC(); + if (STRMPnthread ==
MonetDB: string_imprints - Fix strimp pointers when reading from...
Changeset: 7de11c47ea3d for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/7de11c47ea3d Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Fix strimp pointers when reading from disk diffs (23 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -77,8 +77,8 @@ /* Macros for accessing metadada of a strimp. These are recorded in the * first 8 bytes of the heap. */ -#define NPAIRS(d) (((d) & (0xff << 8)) >> 8) -#define HSIZE(d) (((d) & (0xffff << 15)) >> 15) +#define NPAIRS(d) ((d) >> 8) & 0xff +#define HSIZE(d) ((d) >> 16) & 0xffff #undef UTF8STRINGS /* Not using utf8 for now */ #ifdef UTF8STRINGS @@ -428,7 +428,7 @@ BATcheckstrimps(BAT *b) && HEAPload(&hp->strimps, nme, "tstrimps", false) == GDK_SUCCEED) { hp->sizes_base = (uint8_t *)hp->strimps.base + 8; /* sizes just after the descriptor */ hp->pairs_base = hp->sizes_base + npairs; /* pairs just after the offsets */ - hp->strimps_base = hp->sizes_base + hsize;/* bitmasks just after the pairs */ + hp->strimps_base = hp->strimps.base + hsize;/* bitmasks just after the pairs */ close(fd); hp->strimps.parentid = b->batCacheid; ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Merge with default
Changeset: 129da86e9686 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/129da86e9686 Modified Files: gdk/gdk.h gdk/gdk_bbp.c gdk/gdk_private.h monetdb5/modules/mal/batExtensions.c sql/backends/monet5/sql.c Branch: string_imprints Log Message: Merge with default diffs (truncated from 126951 to 300 lines): diff --git a/clients/ChangeLog b/clients/ChangeLog --- a/clients/ChangeLog +++ b/clients/ChangeLog @@ -1,3 +1,10 @@ # ChangeLog file for clients # This file is updated with Maddlog +* Wed Aug 11 2021 Sjoerd Mullender +- A new output formatting mode was added to mclient. Use -fcsv-noquote + to produce a CSV (comma-separated values) output where the quote + characters have not been escapes. This can be useful when producing + a single column string output that should be saved as is, e.g. when + using the sys.dump_database() function. + diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out --- a/clients/Tests/exports.stable.out +++ b/clients/Tests/exports.stable.out @@ -344,7 +344,7 @@ void HASHdestroy(BAT *b); BUN HASHlist(Hash *h, BUN i); BUN HASHprobe(const Hash *h, const void *v); void HEAP_free(Heap *heap, var_t block); -void HEAP_initialize(Heap *heap, size_t nbytes, size_t nprivate, int alignment); +gdk_return HEAP_initialize(Heap *heap, size_t nbytes, size_t nprivate, int alignment); var_t HEAP_malloc(BAT *b, size_t nbytes); void HEAPdecref(Heap *h, bool remove); gdk_return HEAPextend(Heap *h, size_t size, bool mayshare) __attribute__((__warn_unused_result__)); diff --git a/clients/mapiclient/mclient.1 b/clients/mapiclient/mclient.1 --- a/clients/mapiclient/mclient.1 +++ b/clients/mapiclient/mclient.1 @@ -204,6 +204,7 @@ The possible values are .BR expanded , .BR x , .BR csv , +.BR csv-noquote , .BR tab , .BR raw , .BR xml , @@ -212,6 +213,8 @@ and .BR rowcount . 
.B csv is comma-separated values; +.B csv-noquote +is comma-separated values without escaping any quotes; .B tab is tab-separated values; .B raw @@ -235,6 +238,10 @@ and is a variation on .B trash where only the number of affected rows is printed. +Normal \fBcsv\fP and \fBtab\fP formatting will use double quotes +around any fields that contain double quotes, white space or the +separator. The \fBcsv-noquote\fP format will prevent that and dump +the contents of the field without any interpretation. In addition to plain \fBcsv\fP, two other forms are possible. \fBcsv=\fP\fIc\fP uses \fIc\fP as column separator; \fBcsv+\fP\fIc\fP uses \fIc\fP as column separator and produces a single header line in diff --git a/clients/mapiclient/mclient.c b/clients/mapiclient/mclient.c --- a/clients/mapiclient/mclient.c +++ b/clients/mapiclient/mclient.c @@ -95,6 +95,7 @@ enum formatters { static enum formatters formatter = NOformatter; char *separator = NULL;/* column separator for CSV/TAB format */ bool csvheader = false;/* include header line in CSV format */ +bool noquote = false; /* don't use quotes in CSV format */ #define DEFWIDTH 80 @@ -942,7 +943,7 @@ CSVrenderer(MapiHdl hdl) while (!mnstr_errnr(toConsole) && (fields = fetch_row(hdl)) != 0) { for (i = 0; i < fields; i++) { s = mapi_fetch_field(hdl, i); - if (s != NULL && s[strcspn(s, specials)] != '\0') { + if (!noquote && s != NULL && s[strcspn(s, specials)] != '\0') { mnstr_printf(toConsole, "%s\"", i == 0 ? 
"" : separator); while (*s) { @@ -1647,6 +1648,7 @@ setFormatter(const char *s) free(separator); separator = NULL; csvheader = false; + noquote = false; #ifdef _TWO_DIGIT_EXPONENT if (formatter == TESTformatter) _set_output_format(0); @@ -1673,6 +1675,29 @@ setFormatter(const char *s) } else separator = strdup(s + 4); csvheader = true; + } else if (strcmp(s, "csv-noquote") == 0) { + noquote = true; + formatter = CSVformatter; + separator = strdup(","); + } else if (strncmp(s, "csv-noquote=", 12) == 0) { + noquote = true; + formatter = CSVformatter; + if (s[12] == '"') { + separator = strdup(s + 13); + if (separator[strlen(separator) - 1] == '"') + separator[strlen(separator) - 1] = 0; + } else + separator = strdup(s + 12); + } else if (strncmp(s, "csv-noquote+", 12) == 0) { + noquote = true; + formatter = CSVformatter; + if (s[12] == '"') { + separator = strdup(s + 13); + if (separator[strlen(separator) - 1] == '"') +
MonetDB: string_imprints - Fix strimp persistence
Changeset: b82dc476a040 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/b82dc476a040 Modified Files: gdk/gdk_bbp.c gdk/gdk_strimps.c Branch: string_imprints Log Message: Fix strimp persistence diffs (40 lines): diff --git a/gdk/gdk_bbp.c b/gdk/gdk_bbp.c --- a/gdk/gdk_bbp.c +++ b/gdk/gdk_bbp.c @@ -4038,6 +4038,11 @@ BBPdiskscan(const char *parent, size_t b #else delete = true; #endif + } else if (strncmp(p + 1, "tstrimps", 8) == 0) { + BAT *b = getdesc(bid); + delete = b == NULL; + if (!delete) + b->tstrimps = (Strimps *)1; } else if (strncmp(p + 1, "new", 3) != 0) { ok = false; } diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -78,7 +78,7 @@ * first 8 bytes of the heap. */ #define NPAIRS(d) (((d) & (0xff << 8)) >> 8) -#define HSIZE(d) (((d) & (0xffff << 16)) >> 16) +#define HSIZE(d) (((d) & (0xffff << 15)) >> 15) #undef UTF8STRINGS /* Not using utf8 for now */ #ifdef UTF8STRINGS @@ -426,9 +426,9 @@ BATcheckstrimps(BAT *b) /* bitmasks */ BATcount(b)*(npairs/8)) && HEAPload(&hp->strimps, nme, "tstrimps", false) == GDK_SUCCEED) { - hp->sizes_base = (uint8_t *)hp + 8; /* sizes start just after the descriptor */ - hp->pairs_base = hp->sizes_base + npairs; /* pairs start after the offsets */ - hp->strimps_base = hp->sizes_base + hsize; /* bitmasks start after the pairs */ + hp->sizes_base = (uint8_t *)hp->strimps.base + 8; /* sizes just after the descriptor */ + hp->pairs_base = hp->sizes_base + npairs; /* pairs just after the offsets */ + hp->strimps_base = hp->sizes_base + hsize;/* bitmasks just after the pairs */ close(fd); hp->strimps.parentid = b->batCacheid; ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Use candidate lists for strimps
Changeset: 3d18e45d5375 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/3d18e45d5375 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Use candidate lists for strimps We use candidate lists both for strimp creation and for filtering. diffs (136 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -242,20 +242,25 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai lng t0 = 0; BATiter bi; str cs; - BUN i; + BUN i, ncand; size_t hidx; + oid x; size_t hlen; PairHistogramElem *hist; PairIterator pi, *pip; CharPair cp, *cpp; + struct canditer ci; - (void)s; TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec(); hlen = STRIMP_HISTSIZE; if ((hist = (PairHistogramElem *)GDKmalloc(hlen*sizeof(PairHistogramElem))) == NULL) { - // TODO handle error - return 0; + return false; + } + + ncand = canditer_init(, b, s); + if (ncand == 0) { + return false; } for(hidx = 0; hidx < hlen; hidx++) { @@ -267,8 +272,9 @@ STRMPbuildHeader(BAT *b, BAT *s, CharPai bi = bat_iterator(b); pip = cpp = - for (i = 0; i < b->batCount; i++) { - cs = (str)BUNtvar(bi, i); + for (i = 0; i < ncand; i++) { + x = canditer_next() - b->hseqbase; + cs = (str)BUNtvar(bi, x); if (!strNil(cs)) { pi.s = cs; pi.pos = 0; @@ -339,8 +345,8 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s) if (b->tstrimps == NULL) { MT_lock_set(>batIdxLock); /* Make sure no other thread got here first */ -if (b->tstrimps == NULL) { - STRMPbuildHeader(b, s, hpairs); /* Find the header pairs */ +if (b->tstrimps == NULL && + STRMPbuildHeader(b, s, hpairs)) { /* Find the header pairs, put the result in hpairs */ sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the descriptor and the pair sizes */ for (i = 0; i < STRIMP_HEADER_SIZE; i++) { sz += hpairs[i].psize; @@ -464,14 +470,14 @@ BAT * STRMPfilter(BAT *b, BAT *s, char *q) { BAT *r = NULL; - BUN i; + BUN i, ncand; uint64_t qbmask; uint64_t *ptr; Strimps *strmps; - (void)s; + oid x; + struct canditer ci; if (isVIEW(b)) { - // b = 
BBP_cache(VIEWtparent(b)); BAT *pb = BBP_cache(VIEWtparent(b)); if (!BATcheckstrimps(pb)) goto sfilter_fail; @@ -483,17 +489,27 @@ STRMPfilter(BAT *b, BAT *s, char *q) strmps = b->tstrimps; } - r = COLnew(b->hseqbase, TYPE_oid, b->batCount, TRANSIENT); + ncand = canditer_init(, b, s); + if (ncand == 0) + /* Is this correct? */ + return BATdense(b->hseqbase, 0, 0); + r = COLnew(b->hseqbase, TYPE_oid, ncand, TRANSIENT); if (r == NULL) { goto sfilter_fail; } + /* TODO: Compare patterns with and without SQL pattern metachars +* (% and _). Theoretically they should produce the same results +* because bitstring creation ignores punctuation characters +* (see the macro isIgnored). +*/ qbmask = STRMPmakebitstring(q, strmps); ptr = (uint64_t *)strmps->strimps_base; - for (i = 0; i < b->batCount; i++) { - if ((*(ptr + i) & qbmask) == qbmask) { - oid pos = i + b->hseqbase; + for (i = 0; i < ncand; i++) { + x = canditer_next() - b->hseqbase; + if ((*(ptr + x) & qbmask) == qbmask) { + oid pos = x + b->hseqbase; if (BUNappend(r, , false) != GDK_SUCCEED) goto sfilter_fail; } @@ -590,11 +606,13 @@ STRMPcreate(BAT *b, BAT *s) { lng t0 = 0; BATiter bi; - BUN i; + BUN i, ncand; str cs; Strimps *h; uint64_t *dh; BAT *pb; + oid x; + struct canditer ci; TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec(); if (b->ttype != TYPE_str) { @@ -619,9 +637,12 @@ STRMPcreate(BAT *b, BAT *s) } dh = (uint64_t *)((uint8_t*)h->strimps.base + h->strimps.free + b->hseqbase*8); + ncand = canditer_init(, b, s); + bi = bat_iterator(b); - for (i = 0; i < bi.count; i++) { - cs = (str)BUNtvar(bi, i); + for (i = 0; i < ncand; i++) { + x = canditer_next() - b->hseqbase; + cs = (str)BUNtvar(bi, x); if (!strNil(cs)) *dh++ = STRMPmakebitstring(cs, h);
MonetDB: string_imprints - Allocate a new mal block in optimizer
Changeset: b77f70925fae for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/b77f70925fae Modified Files: monetdb5/optimizer/opt_pipes.c monetdb5/optimizer/opt_strimps.c Branch: string_imprints Log Message: Allocate a new mal block in optimizer Some fixes due to Martin: 1. Make sure the optimizer needs to run beforehand. 2. Allocate a new mal block and push instructions there. 3. Free the old mal block at the end of the run. diffs (120 lines): diff --git a/monetdb5/optimizer/opt_pipes.c b/monetdb5/optimizer/opt_pipes.c --- a/monetdb5/optimizer/opt_pipes.c +++ b/monetdb5/optimizer/opt_pipes.c @@ -58,9 +58,9 @@ static struct PIPELINES { "optimizer.inline();" "optimizer.remap();" "optimizer.bincopyfrom();" -"optimizer.strimps();" "optimizer.deadcode();" "optimizer.multiplex();" +"optimizer.strimps();" "optimizer.generator();" "optimizer.profiler();" //"optimizer.candidates();" only for decoration in explain diff --git a/monetdb5/optimizer/opt_strimps.c b/monetdb5/optimizer/opt_strimps.c --- a/monetdb5/optimizer/opt_strimps.c +++ b/monetdb5/optimizer/opt_strimps.c @@ -23,9 +23,9 @@ str OPTstrimpsImplementation(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) { - int i, limit, needed =0, actions=0; + int i, limit, slimit, needed =0, actions=0; // int mvcvar = -1; - InstrPtr p,q,r, *old = mb->stmt; + InstrPtr p, q, *old = mb->stmt; char buf[256]; lng usec = GDKusec(); str msg = MAL_SUCCEED; @@ -37,63 +37,63 @@ OPTstrimpsImplementation(Client cntxt, M (void) cntxt; (void) stk; /* to fool compilers */ + limit= mb->stop; if ( mb->inlineProp ) return MAL_SUCCEED; - // check applicability first - for( i=0; i < limit; i++){ + for(i=0; i < limit; i++) { p = old[i]; - if ( getModuleId(p) == algebraRef && getFunctionId(p) == likeselectRef) + if (getModuleId(p) == algebraRef && getFunctionId(p) == likeselectRef) needed = 1; } + if (!needed) goto bailout; - limit= mb->stop; - if ( newMalBlkStmt(mb, mb->ssize + 20) < 0) + if (newMalBlkStmt(mb, mb->ssize + 20) < 0) 
throw(MAL,"optimizer.strimps", SQLSTATE(HY013) MAL_MALLOC_FAIL); + slimit = mb->stop; + for (i = 0; i < limit; i++) { p = old[i]; -if (p->token == ENDsymbol){ -pushInstruction(mb,p); -break; -} + if (p->token == ENDsymbol) { + pushInstruction(mb,p); + break; + } + /* Look for bind operations on strings, because for those we migh need strimps */ if (getModuleId(p) == algebraRef && getFunctionId(p) == likeselectRef) { - q = newInstruction(0, strimpsRef, mkstrimpsRef); /* This should be void? */ - setDestVar(q, newTmpVariable(mb, TYPE_void)); + + /* cst.vtype = TYPE_bit; */ + /* nvar = defConstant(mb, TYPE_bit, ); */ + q = newInstruction(mb, strimpsRef, strimpFilterSelectRef); + res = newTmpVariable(mb, newBatType(TYPE_oid)); + setDestVar(q, res); q = addArgument(mb, q, getArg(p, 1)); + q = addArgument(mb, q, getArg(p, 2)); + q = addArgument(mb, q, getArg(p, 3)); + q = addArgument(mb, q, getArg(p, 6)); pushInstruction(mb, q); typeChecker(cntxt->usermodule, mb, q, mb->stop-1, TRUE); - /* cst.vtype = TYPE_bit; */ - /* nvar = defConstant(mb, TYPE_bit, ); */ - r = newInstruction(mb, strimpsRef, strimpFilterSelectRef); - res = newTmpVariable(mb, newBatType(TYPE_oid)); - setDestVar(r, res); - r = addArgument(mb, r, getArg(p, 1)); - r = addArgument(mb, r, getArg(p, 2)); - r = addArgument(mb, r, getArg(p, 3)); - r = addArgument(mb, r, getArg(p, 6)); - - pushInstruction(mb, r); - // typeChecker(cntxt->usermodule, mb, r, mb->stop-1, TRUE); + p = setArgument(mb, p, 2, getArg(q, 0)); actions++; } pushInstruction(mb, p); } - for (; i < slimit; i++) - if (old[i]) - freeInstruction(old[i]); + (void)slimit; + /* for (; i < slimit; i++) */ + /* if (old[i]) */ + /* freeInstruction(old[i]);
MonetDB: string_imprints - strimp creation and filtering should ...
Changeset: 139143c85939 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/139143c85939 Modified Files: gdk/gdk_strimps.c gdk/gdk_strimps.h monetdb5/modules/mal/strimps.c sql/backends/monet5/sql_strimps.c Branch: string_imprints Log Message: strimp creation and filtering should work with candidates This commit changes the interface of the GDK functions and how they are called. diffs (190 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -238,10 +238,10 @@ STRMPchoosePairs(PairHistogramElem *hist } static bool -STRMPbuildHeader(BAT *b, CharPair *hpairs) { +STRMPbuildHeader(BAT *b, BAT *s, CharPair *hpairs) { lng t0 = 0; BATiter bi; - str s; + str cs; BUN i; size_t hidx; size_t hlen; @@ -249,6 +249,8 @@ STRMPbuildHeader(BAT *b, CharPair *hpair PairIterator pi, *pip; CharPair cp, *cpp; + (void)s; + TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec(); hlen = STRIMP_HISTSIZE; if ((hist = (PairHistogramElem *)GDKmalloc(hlen*sizeof(PairHistogramElem))) == NULL) { @@ -266,9 +268,9 @@ STRMPbuildHeader(BAT *b, CharPair *hpair pip = cpp = for (i = 0; i < b->batCount; i++) { - s = (str)BUNtvar(bi, i); - if (!strNil(s)) { - pi.s = s; + cs = (str)BUNtvar(bi, i); + if (!strNil(cs)) { + pi.s = cs; pi.pos = 0; pi.lim = strlen(pi.s); while (pair_at(pip, cpp)) { @@ -324,7 +326,7 @@ STRMPbuildHeader(BAT *b, CharPair *hpair /* Create the heap for a string imprint. Returns NULL on failure. 
*/ static Strimps * -STRMPcreateStrimpHeap(BAT *b) +STRMPcreateStrimpHeap(BAT *b, BAT *s) { uint8_t *h1, *h2; Strimps *r = NULL; @@ -338,7 +340,7 @@ STRMPcreateStrimpHeap(BAT *b) MT_lock_set(>batIdxLock); /* Make sure no other thread got here first */ if (b->tstrimps == NULL) { - STRMPbuildHeader(b, hpairs); /* Find the header pairs */ + STRMPbuildHeader(b, s, hpairs); /* Find the header pairs */ sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the descriptor and the pair sizes */ for (i = 0; i < STRIMP_HEADER_SIZE; i++) { sz += hpairs[i].psize; @@ -459,13 +461,14 @@ BATcheckstrimps(BAT *b) * list. */ BAT * -STRMPfilter(BAT *b, char *q) +STRMPfilter(BAT *b, BAT *s, char *q) { BAT *r = NULL; BUN i; uint64_t qbmask; uint64_t *ptr; Strimps *strmps; + (void)s; if (isVIEW(b)) { // b = BBP_cache(VIEWtparent(b)); @@ -583,12 +586,12 @@ static ATOMIC_TYPE STRMPnthread = ATOMIC /* Create */ gdk_return -STRMPcreate(BAT *b) +STRMPcreate(BAT *b, BAT *s) { lng t0 = 0; BATiter bi; BUN i; - str s; + str cs; Strimps *h; uint64_t *dh; BAT *pb; @@ -611,16 +614,16 @@ STRMPcreate(BAT *b) if (BATcheckstrimps(pb)) return GDK_SUCCEED; -if ((h = STRMPcreateStrimpHeap(pb)) == NULL) { +if ((h = STRMPcreateStrimpHeap(pb, s)) == NULL) { return GDK_FAIL; } dh = (uint64_t *)((uint8_t*)h->strimps.base + h->strimps.free + b->hseqbase*8); bi = bat_iterator(b); for (i = 0; i < bi.count; i++) { - s = (str)BUNtvar(bi, i); - if (!strNil(s)) - *dh++ = STRMPmakebitstring(s, h); + cs = (str)BUNtvar(bi, i); + if (!strNil(cs)) + *dh++ = STRMPmakebitstring(cs, h); else *dh++ = 0; /* no pairs in nil values */ } diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h --- a/gdk/gdk_strimps.h +++ b/gdk/gdk_strimps.h @@ -42,6 +42,6 @@ typedef struct { // gdk_export gdk_return STRMPmakehistogramBP(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins); // make static // gdk_export gdk_return STRMP_make_header(StrimpHeader *h, uint64_t *hist, size_t hist_size); // make static // gdk_export gdk_return 
STRMP_make_header(BAT *b); -gdk_export gdk_return STRMPcreate(BAT *b); -gdk_export BAT *STRMPfilter(BAT *b, char *q); +gdk_export gdk_return STRMPcreate(BAT *b, BAT *s); +gdk_export BAT *STRMPfilter(BAT *b, BAT *s, char *q); #endif /* _GDK_STRIMPS_H_ */ diff --git a/monetdb5/modules/mal/strimps.c b/monetdb5/modules/mal/strimps.c --- a/monetdb5/modules/mal/strimps.c +++ b/monetdb5/modules/mal/strimps.c @@ -79,8 +79,8 @@ PATstrimp_makehist(Client cntxt, MalBlkP static str PATstrimpCreate(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) { - bat bid; - BAT *b; + bat bid, sid; + BAT *b, *s;
MonetDB: string_imprints - Add logging info to STRMPfilter
Changeset: c70da469b348 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/c70da469b348 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Add logging info to STRMPfilter diffs (23 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -476,6 +476,9 @@ STRMPfilter(BAT *b, BAT *s, char *q) Strimps *strmps; oid x; struct canditer ci; + lng t0 = 0; + + TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec(); if (isVIEW(b)) { BAT *pb = BBP_cache(VIEWtparent(b)); @@ -520,6 +523,9 @@ STRMPfilter(BAT *b, BAT *s, char *q) r->trevsorted = BATcount(r) <= 1; r->tnil = false; r->tnonil = true; + TRC_DEBUG(ACCELERATOR, "strimp prefiltering of " LLFMT + " items took " LLFMT " usec\n", ncand, GDKusec()-t0); + TRC_DEBUG(ACCELERATOR, "r->" ALGOBATFMT "\n", ALGOBATPAR(r) ); return virtualize(r); ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Fix initialization check for strimps
Changeset: 238caa07b6bc for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/238caa07b6bc Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Fix initialization check for strimps A strimp is initialized if the strimp exists and the number of bitstrings is equal to the bat count diffs (17 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -456,7 +456,12 @@ BATcheckstrimps(BAT *b) } MT_lock_unset(&b->batIdxLock); } -ret = b->tstrimps != NULL; + /* The string imprint is initialized if the strimp pointer is +* not null and the number of bitstrings is equal to the bat +* count. +*/ +ret = b->tstrimps != NULL && + (b->tstrimps->strimps.free - ((char *)b->tstrimps->strimps_base - b->tstrimps->strimps.base))/sizeof(uint64_t) == b->batCount; if (ret) TRC_DEBUG(ACCELERATOR, "BATcheckstrimps(" ALGOBATFMT "): already has strimps, waited " LLFMT " usec\n", ALGOBATPAR(b), GDKusec() - t); ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Treat gdk_use_strimps as an integer
Changeset: 95d06449d0b1 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/95d06449d0b1 Modified Files: monetdb5/modules/mal/pcre.c Branch: string_imprints Log Message: Treat gdk_use_strimps as an integer diffs (29 lines): diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c --- a/monetdb5/modules/mal/pcre.c +++ b/monetdb5/modules/mal/pcre.c @@ -1873,7 +1873,7 @@ PCRElikeselect(bat *ret, const bat *bid, str msg = MAL_SUCCEED; char *ppat = NULL; bool use_re = false, use_strcmp = false, empty = false; - bool use_strimps = GDKgetenv("gdk_use_strimps"); + bool use_strimps = GDKgetenv_int("gdk_use_strimps", 0); if ((b = BATdescriptor(*bid)) == NULL) { msg = createException(MAL, "algebra.likeselect", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); @@ -1884,6 +1884,8 @@ PCRElikeselect(bat *ret, const bat *bid, goto bailout; } + assert(ATOMstorage(b->ttype) == TYPE_str); + if (use_strimps) { if (STRMPcreate(b, NULL) == GDK_SUCCEED) { BAT *tmp_s; @@ -1895,7 +1897,6 @@ PCRElikeselect(bat *ret, const bat *bid, } - assert(ATOMstorage(b->ttype) == TYPE_str); if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, pat, esc)) != MAL_SUCCEED) goto bailout; ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Merge with default
Changeset: 8d90a78cdb68 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/8d90a78cdb68 Modified Files: gdk/gdk.h gdk/gdk_bbp.c gdk/gdk_private.h monetdb5/optimizer/opt_prelude.c monetdb5/optimizer/opt_prelude.h sql/backends/monet5/sql.c sql/scripts/CMakeLists.txt Branch: string_imprints Log Message: Merge with default diffs (truncated from 32345 to 300 lines): diff --git a/MonetDB.spec b/MonetDB.spec --- a/MonetDB.spec +++ b/MonetDB.spec @@ -527,7 +527,6 @@ exit 0 %{_libdir}/monetdb5/lib_capi.so %endif %{_libdir}/monetdb5/lib_generator.so -%{_libdir}/monetdb5/lib_udf.so %doc %{_mandir}/man1/mserver5.1.gz %dir %{_datadir}/doc/MonetDB %docdir %{_datadir}/doc/MonetDB @@ -832,6 +831,7 @@ rm -f %{buildroot}%{_libdir}/monetdb5/ru rm -f %{buildroot}%{_libdir}/monetdb5/lib_run_*.so rm -f %{buildroot}%{_libdir}/monetdb5/microbenchmark.mal rm -f %{buildroot}%{_libdir}/monetdb5/lib_microbenchmark*.so +rm -f %{buildroot}%{_libdir}/monetdb5/lib_udf*.so rm -f %{buildroot}%{_bindir}/monetdb_mtest.sh rm -rf %{buildroot}%{_datadir}/monetdb # /cmake diff --git a/NT/mksqlwxs.py b/NT/mksqlwxs.py --- a/NT/mksqlwxs.py +++ b/NT/mksqlwxs.py @@ -191,9 +191,9 @@ def main(): print(r'') print(r' ') id = comp(features, id, 16, - [r'lib\monetdb5\{}'.format(x) for x in sorted(filter(lambda x: x.startswith('_') and x.endswith('.dll') and ('geom' not in x) and ('pyapi' not in x) and ('opt_sql_append' not in x) and ('run_' not in x) and ('microbenchmark' not in x), os.listdir(os.path.join(sys.argv[3], 'lib', 'monetdb5']) + [r'lib\monetdb5\{}'.format(x) for x in sorted(filter(lambda x: x.startswith('_') and x.endswith('.dll') and ('geom' not in x) and ('pyapi' not in x) and ('opt_sql_append' not in x) and ('run_' not in x) and ('microbenchmark' not in x) and ('udf' not in x), os.listdir(os.path.join(sys.argv[3], 'lib', 'monetdb5']) id = comp(debug, id, 16, - [r'lib\monetdb5\{}'.format(x) for x in sorted(filter(lambda x: x.startswith('_') and x.endswith('.pdb') and ('geom' not in x) and 
('opt_sql_append' not in x) and ('run_' not in x) and ('microbenchmark' not in x), os.listdir(os.path.join(sys.argv[3], 'lib', 'monetdb5']) + [r'lib\monetdb5\{}'.format(x) for x in sorted(filter(lambda x: x.startswith('_') and x.endswith('.pdb') and ('geom' not in x) and ('opt_sql_append' not in x) and ('run_' not in x) and ('microbenchmark' not in x) and ('udf' not in x), os.listdir(os.path.join(sys.argv[3], 'lib', 'monetdb5']) id = comp(geom, id, 16, [r'lib\monetdb5\{}'.format(x) for x in sorted(filter(lambda x: x.startswith('_') and (x.endswith('.dll') or x.endswith('.pdb')) and ('geom' in x), os.listdir(os.path.join(sys.argv[3], 'lib', 'monetdb5']) id = comp(pyapi3, id, 16, diff --git a/clients/Tests/MAL-signatures.stable.out b/clients/Tests/MAL-signatures.stable.out --- a/clients/Tests/MAL-signatures.stable.out +++ b/clients/Tests/MAL-signatures.stable.out @@ -1,697 +1,694 @@ -stdout of test 'MAL-signatures` in directory 'clients` itself: - -#select * from sys.malfunctions() order by module, "function", address, signature, comment; % .%1, .%1,.%1,.%1,.%1 # table_name % module, function, signature, address,comment # name % clob,clob, clob, clob, clob # type -% 12, 28, 313,42, 0 # length -[ "aggr", "all", "command aggr.all(X_0:bat[:any_1]):any_1 ", "SQLall;", "" ] -[ "aggr", "allnotequal", "pattern aggr.allnotequal(X_0:bat[:any_1], X_1:bat[:any_1]):bit ", "SQLallnotequal;", "" ] +% 12, 28, 313,42, 860 # length +[ "aggr", "all", "command aggr.all(X_0:bat[:any_1]):any_1 ", "SQLall;", "if all values in b are equal return this, else nil"] +[ "aggr", "allnotequal", "pattern aggr.allnotequal(X_0:bat[:any_1], X_1:bat[:any_1]):bit ", "SQLallnotequal;", "if all values in r are not equal to l return true, else if r has nil nil else false" ] [ "aggr", "anyequal", "pattern aggr.anyequal(X_0:any_1, X_1:any_1):bit ", "CMDvarEQ;","" ] -[ "aggr", "anyequal", "pattern aggr.anyequal(X_0:bat[:any_1], X_1:bat[:any_1]):bit ", "SQLanyequal;", "" ] -[ "aggr", "avg", "command 
aggr.avg(X_0:bat[:bte], X_1:bat[:oid], X_2:bat[:any_1]):bat[:dbl] ", "AGGRavg13_dbl;", "" ] -[ "aggr", "avg", "command aggr.avg(X_0:bat[:dbl], X_1:bat[:oid], X_2:bat[:any_1]):bat[:dbl] ", "AGGRavg13_dbl;", "" ] -[ "aggr", "avg", "command aggr.avg(X_0:bat[:flt], X_1:bat[:oid], X_2:bat[:any_1]):bat[:dbl] ", "AGGRavg13_dbl;", "" ] -[ "aggr", "avg", "command aggr.avg(X_0:bat[:int], X_1:bat[:oid], X_2:bat[:any_1]):bat[:dbl] ", "AGGRavg13_dbl;", "" ] -[ "aggr", "avg", "command
MonetDB: string_imprints - Code cleanup
Changeset: 3ca155710c3f for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/3ca155710c3f Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Code cleanup diffs (88 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -77,8 +77,8 @@ /* Macros for accessing metadada of a strimp. These are recorded in the * first 8 bytes of the heap. */ -#define NPAIRS(d) ((d) >> 8) & 0xff -#define HSIZE(d) ((d) >> 16) & 0x +#define NPAIRS(d) (((d) >> 8) & 0xff) +#define HSIZE(d) (((d) >> 16) & 0x) #undef UTF8STRINGS /* Not using utf8 for now */ #ifdef UTF8STRINGS @@ -371,9 +371,10 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s) r->pairs_base = h2 = (uint8_t *)h1 + STRIMP_HEADER_SIZE; for (i = 0; i < STRIMP_HEADER_SIZE; i++) { - *(h1 + i) = hpairs[i].psize; - memcpy(h2, hpairs[i].pbytes, hpairs[i].psize); - h2 += hpairs[i].psize; + uint8_t psize = hpairs[i].psize; + h1[i] = psize; + memcpy(h2, hpairs[i].pbytes, psize); + h2 += psize; } r->strimps_base = h2; r->strimps.free = sz; @@ -386,12 +387,20 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s) return b->tstrimps; } +#define STRIMP_COMPLETE(b) \ + b->tstrimps != NULL &&\ + (b->tstrimps->strimps.free - ((char *)b->tstrimps->strimps_base - b->tstrimps->strimps.base))/sizeof(uint64_t) == b->batCount + static bool BATcheckstrimps(BAT *b) { bool ret; lng t = GDKusec(); + if (b == NULL) + return false; + + assert(b->batCacheid > 0); if (b->tstrimps == (Strimps *)1) { assert(!GDKinmemory(b->theap->farmid)); MT_lock_set(>batIdxLock); @@ -422,7 +431,7 @@ BATcheckstrimps(BAT *b) && (desc & 0xff) == STRIMP_VERSION && ((npairs = NPAIRS(desc)) == 32 || npairs == 64) && (hsize = HSIZE(desc)) >= 96 && hsize <= 640 - && ((desc & ((uint64_t)0xff << 32)) >> 32) == 1 + && ((desc >> 32) & 0xff) == 1 /* check the persistence byte */ && fstat(fd, ) == 0 && st.st_size >= (off_t) (hp->strimps.free = hp->strimps.size = /* descriptor */ @@ -460,12 +469,15 @@ BATcheckstrimps(BAT *b) * not null 
and the number of bitstrings is equal to the bat * count. */ -ret = b->tstrimps != NULL && - (b->tstrimps->strimps.free - ((char *)b->tstrimps->strimps_base - b->tstrimps->strimps.base))/sizeof(uint64_t) == b->batCount; - if (ret) - TRC_DEBUG(ACCELERATOR, "BATcheckstrimps(" ALGOBATFMT "): already has strimps, waited " LLFMT " usec\n", ALGOBATPAR(b), GDKusec() - t); + assert(!b->tstrimps || (b->tstrimps->strimps.free - HSIZE(((uint64_t *)b->tstrimps->strimps.base)[0]))/sizeof(uint64_t) <= b->batCount); + ret = STRIMP_COMPLETE(b); +if (ret) { + TRC_DEBUG(ACCELERATOR, + "BATcheckstrimps(" ALGOBATFMT "): already has strimps, waited " LLFMT " usec\n", + ALGOBATPAR(b), GDKusec() - t); + } - return ret; +return ret; } /* Filter a BAT b using a string q. Return the result as a candidate @@ -582,7 +594,7 @@ BATstrimpsync(void *arg) failed = ""; } } - TRC_DEBUG(ACCELERATOR, "BATstrimpsync(%s): strimps persisted" + TRC_DEBUG(ACCELERATOR, "BATstrimpsync(%s): strimp persisted" " (" LLFMT " usec)%s\n", BATgetId(b), GDKusec() - t0, failed); } ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Do not create strimps for small bats
Changeset: 2e33639ce402 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/2e33639ce402 Modified Files: gdk/gdk_strimps.h monetdb5/modules/mal/pcre.c Branch: string_imprints Log Message: Do not create strimps for small bats diffs (32 lines): diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h --- a/gdk/gdk_strimps.h +++ b/gdk/gdk_strimps.h @@ -15,6 +15,7 @@ #define STRIMP_VERSION (uint64_t)1 #define STRIMP_HISTSIZE 256*256 #define STRIMP_HEADER_SIZE 64 +#define STRIMP_CREATION_THRESHOLD 5000 /* do not create strimp for "small" BATs */ typedef struct { uint8_t *pbytes; diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c --- a/monetdb5/modules/mal/pcre.c +++ b/monetdb5/modules/mal/pcre.c @@ -1886,13 +1886,14 @@ PCRElikeselect(bat *ret, const bat *bid, assert(ATOMstorage(b->ttype) == TYPE_str); - if (use_strimps) { + if (use_strimps && BATcount(b) >= STRIMP_CREATION_THRESHOLD) { if (STRMPcreate(b, NULL) == GDK_SUCCEED) { BAT *tmp_s; tmp_s = STRMPfilter(b, s, *pat); - if(s) + if (tmp_s && s) { BBPunfix(s->batCacheid); - s = tmp_s; + s = tmp_s; + } } /* If we cannot create the strimp just continue normally */ } ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Fix reading strimps from the disk
Changeset: 1d265612c715 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/1d265612c715 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Fix reading strimps from the disk diffs (44 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -423,25 +423,29 @@ BATcheckstrimps(BAT *b) uint64_t npairs; uint64_t hsize; /* Read the 8 byte long strimp -* descriptor and make sure that -* the number of pairs is either -* 32 or 64. +* descriptor. +* +* NPAIRS must be 64 in the +* current implementation. +* +* HSIZE must be between 200 and +* 584 (inclusive): 8 bytes the +* descritor, 64 bytes the pair +* sizes and n*64 bytes the +* actual pairs where 2 <= n <= +* 8. */ if (read(fd, &desc, 8) == 8 && (desc & 0xff) == STRIMP_VERSION - && ((npairs = NPAIRS(desc)) == 32 || npairs == 64) - && (hsize = HSIZE(desc)) >= 96 && hsize <= 640 + && ((npairs = NPAIRS(desc)) == 64) + && (hsize = HSIZE(desc)) >= 200 && hsize <= 584 && ((desc >> 32) & 0xff) == 1 /* check the persistence byte */ && fstat(fd, &st) == 0 && st.st_size >= (off_t) (hp->strimps.free = hp->strimps.size = - /* descriptor */ - 8 + - /* header size (offsets + pairs) */ + /* header size (desc + offsets + pairs) */ hsize + - /* padding to 4 or 8 byte boundary */ - hsize%(npairs/8) == (npairs/8)? 0 : (npairs/8+((npairs/8) - hsize%(npairs/8))) + /* bitmasks */ - BATcount(b)*(npairs/8)) + BATcount(b)*sizeof(uint64_t)) && HEAPload(&hp->strimps, nme, "tstrimps", false) == GDK_SUCCEED) { hp->sizes_base = (uint8_t *)hp->strimps.base + 8; /* sizes just after the descriptor */ hp->pairs_base = hp->sizes_base + npairs; /* pairs just after the offsets */ ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Persist strimp when it is fully created
Changeset: 7bd939f4ba52 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/7bd939f4ba52 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Persist strimp when it is fully created A strimp is fully created when we have computed a bitstring for every string in the bat. diffs (32 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -621,8 +621,6 @@ persistStrimp(BAT *b) TRC_DEBUG(ACCELERATOR, "persistStrimp(" ALGOBATFMT "): NOT persisting strimp\n", ALGOBATPAR(b)); } -static ATOMIC_TYPE STRMPnthread = ATOMIC_VAR_INIT(0); - /* Create */ gdk_return STRMPcreate(BAT *b, BAT *s) @@ -643,7 +641,6 @@ STRMPcreate(BAT *b, BAT *s) return GDK_FAIL; } - (void)ATOMIC_INC(&STRMPnthread); /* Disable this before merging to default */ if (VIEWtparent(b)) { pb = BBP_cache(VIEWtparent(b)); @@ -678,10 +675,8 @@ STRMPcreate(BAT *b, BAT *s) MT_lock_unset(>batIdxLock); /* The thread that reaches this point last needs to write the strimp to disk. */ - (void)ATOMIC_DEC(&STRMPnthread); - if (STRMPnthread == 0) { + if (STRIMP_COMPLETE(pb)) persistStrimp(pb); - } TRC_DEBUG(ACCELERATOR, "strimp creation took " LLFMT " usec\n", GDKusec()-t0); return GDK_SUCCEED; ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Use correct address to bitstrings
Changeset: 97a1a209934a for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/97a1a209934a Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Use correct address to bitstrings Specifically do not use the value of strimps.free because it is updated by multiple threads. diffs (12 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -655,7 +655,7 @@ STRMPcreate(BAT *b, BAT *s) if ((h = STRMPcreateStrimpHeap(pb, s)) == NULL) { return GDK_FAIL; } - dh = (uint64_t *)((uint8_t*)h->strimps.base + h->strimps.free + b->hseqbase*8); + dh = (uint64_t *)h->strimps_base + b->hseqbase; ncand = canditer_init(, b, s); ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Add a GDK env var to enable use of st...
Changeset: c3fda7365a01 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/c3fda7365a01 Modified Files: monetdb5/modules/mal/pcre.c Branch: string_imprints Log Message: Add a GDK env var to enable use of strimps diffs (12 lines): diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c --- a/monetdb5/modules/mal/pcre.c +++ b/monetdb5/modules/mal/pcre.c @@ -1873,7 +1873,7 @@ PCRElikeselect(bat *ret, const bat *bid, str msg = MAL_SUCCEED; char *ppat = NULL; bool use_re = false, use_strcmp = false, empty = false; - bool use_strimps = true; + bool use_strimps = GDKgetenv("gdk_use_strimps"); if ((b = BATdescriptor(*bid)) == NULL) { msg = createException(MAL, "algebra.likeselect", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Fix mitosis bug in strimp filtering
Changeset: 49164109a169 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/49164109a169 Modified Files: gdk/gdk_strimps.c monetdb5/modules/mal/pcre.c Branch: string_imprints Log Message: Fix mitosis bug in strimp filtering diffs (70 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -472,7 +472,7 @@ STRMPfilter(BAT *b, BAT *s, char *q) BAT *r = NULL; BUN i, ncand; uint64_t qbmask; - uint64_t *ptr; + uint64_t *bitstring_array; Strimps *strmps; oid x; struct canditer ci; @@ -491,7 +491,6 @@ STRMPfilter(BAT *b, BAT *s, char *q) ncand = canditer_init(&ci, b, s); if (ncand == 0) - /* Is this correct? */ return BATdense(b->hseqbase, 0, 0); r = COLnew(b->hseqbase, TYPE_oid, ncand, TRANSIENT); if (r == NULL) { @@ -504,14 +503,15 @@ STRMPfilter(BAT *b, BAT *s, char *q) * (see the macro isIgnored). */ qbmask = STRMPmakebitstring(q, strmps); - ptr = (uint64_t *)strmps->strimps_base; + bitstring_array = (uint64_t *)strmps->strimps_base; for (i = 0; i < ncand; i++) { - x = canditer_next(&ci) - b->hseqbase; - if ((*(ptr + x) & qbmask) == qbmask) { - oid pos = x + b->hseqbase; - if (BUNappend(r, &pos, false) != GDK_SUCCEED) + x = canditer_next(&ci); + if ((bitstring_array[x] & qbmask) == qbmask) { + if (BUNappend(r, &x, false) != GDK_SUCCEED) { + BBPunfix(r->batCacheid); goto sfilter_fail; + } } } diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c --- a/monetdb5/modules/mal/pcre.c +++ b/monetdb5/modules/mal/pcre.c @@ -1873,6 +1873,7 @@ PCRElikeselect(bat *ret, const bat *bid, str msg = MAL_SUCCEED; char *ppat = NULL; bool use_re = false, use_strcmp = false, empty = false; + bool use_strimps = true; if ((b = BATdescriptor(*bid)) == NULL) { msg = createException(MAL, "algebra.likeselect", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); @@ -1883,6 +1884,17 @@ PCRElikeselect(bat *ret, const bat *bid, goto bailout; } + if (use_strimps) { + if (STRMPcreate(b, NULL) == GDK_SUCCEED) { + BAT *tmp_s; + tmp_s = STRMPfilter(b, s, 
*pat); + if(s) + BBPunfix(s->batCacheid); + s = tmp_s; + } /* If we cannot create the strimp just continue normally */ + + } + assert(ATOMstorage(b->ttype) == TYPE_str); if ((msg = choose_like_path(&ppat, &use_re, &use_strcmp, &empty, pat, esc)) != MAL_SUCCEED) goto bailout; ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Remove unused code
Changeset: e2e6c0d4dbf4 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/e2e6c0d4dbf4 Modified Files: monetdb5/modules/mal/batExtensions.c Branch: string_imprints Log Message: Remove unused code diffs (76 lines): diff --git a/monetdb5/modules/mal/batExtensions.c b/monetdb5/modules/mal/batExtensions.c --- a/monetdb5/modules/mal/batExtensions.c +++ b/monetdb5/modules/mal/batExtensions.c @@ -282,72 +282,6 @@ CMDBATappend_bulk(Client cntxt, MalBlkPt return MAL_SUCCEED; } -#if 0 -/* - * String imprints. - */ -static str -PATstrimp_ndigrams(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) -{ - bat bid; - BAT *b; - size_t n; - - (void)cntxt; - (void)mb; - - // return mythrow(MAL, "batcalc.striter", OPERATION_FAILED); - bid = *getArgReference_bat(stk, pci, 1); - if ((b = BATdescriptor(bid)) == NULL) - throw(MAL, "bat.strimpDigrams", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); - - if (!STRMPndigrams(b, )) { - throw(MAL, "bat.strimpDigrams", SQLSTATE(HY002) OPERATION_FAILED); - } - - *getArgReference_lng(stk, pci, 0) = n; - - return MAL_SUCCEED; -} - -static str -PATstrimp_makehist(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) -{ - bat bid; - BAT *b, *ob; - size_t i; - uint64_t hist[STRIMP_HISTSIZE]; - size_t count; - - (void)cntxt; - (void)mb; - - bid = *getArgReference_bat(stk, pci, 2); - if ((b = BATdescriptor(bid)) == NULL) - throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); - - if (!STRMPmakehistogram(b, hist, STRIMP_HISTSIZE, )) { - throw(MAL, "bat.strimpHistogram", SQLSTATE(HY002) OPERATION_FAILED); - } - - ob = COLnew(0, TYPE_lng, STRIMP_HISTSIZE, TRANSIENT); - if (ob == NULL) { - throw(MAL, "bat.strimpHistogram", SQLSTATE(HY013) MAL_MALLOC_FAIL); - } - - for (i=0; i < STRIMP_HISTSIZE; i++) { - if (BUNappend(ob, hist + i, false) != GDK_SUCCEED) - throw(MAL, "bat.strimpHistogram", SQLSTATE(HY013) MAL_MALLOC_FAIL); - } - - *getArgReference_lng(stk, pci, 0) = count; - *getArgReference_bat(stk, pci, 1) = ob->batCacheid; - - 
BBPkeepref(ob->batCacheid); - return MAL_SUCCEED; -} -#endif - #include "mel.h" mel_func batExtensions_init_funcs[] = { pattern("bat", "new", CMDBATnew, false, "", args(1,2, batargany("",1),argany("tt",1))), ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Scaffolding for strimp optimizer
Changeset: 8ea09480b1ea for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/8ea09480b1ea Added Files: monetdb5/optimizer/opt_strimps.c monetdb5/optimizer/opt_strimps.h Modified Files: monetdb5/modules/mal/strimps.c monetdb5/optimizer/CMakeLists.txt monetdb5/optimizer/opt_pipes.c monetdb5/optimizer/opt_prelude.c monetdb5/optimizer/opt_prelude.h monetdb5/optimizer/opt_wrapper.c monetdb5/optimizer/optimizer.c Branch: string_imprints Log Message: Scaffolding for strimp optimizer diffs (242 lines): diff --git a/monetdb5/modules/mal/strimps.c b/monetdb5/modules/mal/strimps.c --- a/monetdb5/modules/mal/strimps.c +++ b/monetdb5/modules/mal/strimps.c @@ -86,10 +86,10 @@ PATstrimpCreate(Client cntxt, MalBlkPtr bid = *getArgReference_bat(stk, pci, 1); if ((b = BATdescriptor(bid)) == NULL) - throw(MAL, "strimps.strimpHeader", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); + throw(MAL, "strimps.strimpCreate", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); if(STRMPcreate(b) != GDK_SUCCEED) - throw(MAL, "strimps.strimpHistogram", SQLSTATE(HY002) OPERATION_FAILED); + throw(MAL, "strimps.strimpCreate", SQLSTATE(HY002) OPERATION_FAILED); // *getArgReference_lng(stk, pci, 0) = 0; return MAL_SUCCEED; diff --git a/monetdb5/optimizer/CMakeLists.txt b/monetdb5/optimizer/CMakeLists.txt --- a/monetdb5/optimizer/CMakeLists.txt +++ b/monetdb5/optimizer/CMakeLists.txt @@ -49,6 +49,7 @@ target_sources(optimizer opt_postfix.c opt_postfix.h opt_volcano.c opt_volcano.h opt_fastpath.c opt_fastpath.h + opt_strimps.c opt_strimps.h opt_wrapper.c PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/opt_pipes.h) diff --git a/monetdb5/optimizer/opt_pipes.c b/monetdb5/optimizer/opt_pipes.c --- a/monetdb5/optimizer/opt_pipes.c +++ b/monetdb5/optimizer/opt_pipes.c @@ -49,7 +49,20 @@ static struct PIPELINES { "optimizer.deadcode();" "optimizer.multiplex();" "optimizer.generator();" -"optimizer.profiler();" +"optimizer.profiler();" +//"optimizer.candidates();" only for decoration in explain +//"optimizer.mask();" 
+"optimizer.garbageCollector();", +"stable", NULL, 1}, + {"minimal_strimps_pipe", +"optimizer.inline();" +"optimizer.remap();" +"optimizer.bincopyfrom();" +"optimizer.strimps();" +"optimizer.deadcode();" +"optimizer.multiplex();" +"optimizer.generator();" +"optimizer.profiler();" //"optimizer.candidates();" only for decoration in explain //"optimizer.mask();" "optimizer.garbageCollector();", diff --git a/monetdb5/optimizer/opt_prelude.c b/monetdb5/optimizer/opt_prelude.c --- a/monetdb5/optimizer/opt_prelude.c +++ b/monetdb5/optimizer/opt_prelude.c @@ -188,6 +188,7 @@ const char *minusRef; const char *mirrorRef; const char *mitosisRef; const char *mkeyRef; +const char *mkstrimpsRef; const char *mmathRef; const char *modRef; const char *mtimeRef; @@ -279,6 +280,8 @@ const char *startRef; const char *starttraceRef; const char *stoptraceRef; const char *streamsRef; +const char *strimpFilterSelectRef; +const char *strimpsRef; const char *strRef; const char *subavgRef; const char *subcountRef; @@ -495,6 +498,7 @@ void optimizerInit(void) mirrorRef = putName("mirror"); mitosisRef = putName("mitosis"); mkeyRef = putName("mkey"); + mkstrimpsRef = putName("mkstrimp"); mmathRef = putName("mmath"); modRef = putName("%"); mtimeRef = putName("mtime"); @@ -586,6 +590,7 @@ void optimizerInit(void) starttraceRef = putName("starttrace"); stoptraceRef = putName("stoptrace"); streamsRef = putName("streams"); + strimpsRef = putName("strimps"); strRef = putName("str"); subavgRef = putName("subavg"); subcountRef = putName("subcount"); diff --git a/monetdb5/optimizer/opt_prelude.h b/monetdb5/optimizer/opt_prelude.h --- a/monetdb5/optimizer/opt_prelude.h +++ b/monetdb5/optimizer/opt_prelude.h @@ -186,6 +186,7 @@ mal_export const char *minusRef; mal_export const char *mirrorRef; mal_export const char *mitosisRef; mal_export const char *mkeyRef; +mal_export const char *mkstrimpsRef; mal_export const char *mmathRef; mal_export const char *modRef; mal_export const char *mtimeRef; @@ -278,6 
+279,8 @@ mal_export const char *startRef; mal_export const char *starttraceRef; mal_export const char *stoptraceRef; mal_export const char *streamsRef; +mal_export const char *strimpFilterSelectRef; +mal_export const char *strimpsRef; mal_export const char *strRef; mal_export const char *subavgRef; mal_export const char *subcountRef; diff --git a/monetdb5/optimizer/opt_strimps.c b/monetdb5/optimizer/opt_strimps.c new file mode 100644 --- /dev/null +++ b/monetdb5/optimizer/opt_strimps.c @@
MonetDB: string_imprints - Add filtering to the plan (WIP)
Changeset: 2569ffa9872b for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/2569ffa9872b Modified Files: monetdb5/optimizer/opt_prelude.c monetdb5/optimizer/opt_strimps.c Branch: string_imprints Log Message: Add filtering to the plan (WIP) diffs (63 lines): diff --git a/monetdb5/optimizer/opt_prelude.c b/monetdb5/optimizer/opt_prelude.c --- a/monetdb5/optimizer/opt_prelude.c +++ b/monetdb5/optimizer/opt_prelude.c @@ -591,6 +591,7 @@ void optimizerInit(void) stoptraceRef = putName("stoptrace"); streamsRef = putName("streams"); strimpsRef = putName("strimps"); + strimpFilterSelectRef = putName("strimpfilterselect"); strRef = putName("str"); subavgRef = putName("subavg"); subcountRef = putName("subcount"); diff --git a/monetdb5/optimizer/opt_strimps.c b/monetdb5/optimizer/opt_strimps.c --- a/monetdb5/optimizer/opt_strimps.c +++ b/monetdb5/optimizer/opt_strimps.c @@ -26,10 +26,13 @@ OPTstrimpsImplementation(Client cntxt, M int i, limit; // int mvcvar = -1; int count=0; - InstrPtr p,q, *old = mb->stmt; + InstrPtr p,q,r, *old = mb->stmt; char buf[256]; lng usec = GDKusec(); str msg = MAL_SUCCEED; + /* int res, nvar; */ + /* ValRecord cst; */ + int res; (void) pci; (void) cntxt; @@ -53,10 +56,24 @@ OPTstrimpsImplementation(Client cntxt, M pushInstruction(mb, q); typeChecker(cntxt->usermodule, mb, q, mb->stop-1, TRUE); + /* cst.vtype = TYPE_bit; */ + /* nvar = defConstant(mb, TYPE_bit, ); */ + r = newInstruction(mb, strimpsRef, strimpFilterSelectRef); + res = newTmpVariable(mb, newBatType(TYPE_oid)); + setDestVar(r, res); + r = addArgument(mb, r, getArg(p, 1)); + r = addArgument(mb, r, getArg(p, 2)); + r = addArgument(mb, r, getArg(p, 3)); + r = addArgument(mb, r, getArg(p, 6)); + + pushInstruction(mb, r); + // typeChecker(cntxt->usermodule, mb, r, mb->stop-1, TRUE); + count++; } pushInstruction(mb, p); } + GDKfree(old); /* Defense line against incorrect plans */ if( count){ @@ -68,7 +85,7 @@ OPTstrimpsImplementation(Client cntxt, M } /* keep all actions taken as a 
post block comment */ usec = GDKusec()- usec; -snprintf(buf,256,"%-20s actions=%2d time=" LLFMT " usec","volcano",count,usec); +snprintf(buf,256,"%-20s actions=%2d time=" LLFMT " usec","strimps",count,usec); newComment(mb,buf); if( count > 0) addtoMalBlkHistory(mb); ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Use BUNFMT to output BUN values
Changeset: c5f5568d25b1 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/c5f5568d25b1 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Use BUNFMT to output BUN values diffs (14 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -556,8 +556,8 @@ STRMPfilter(BAT *b, BAT *s, char *q) r->trevsorted = BATcount(r) <= 1; r->tnil = false; r->tnonil = true; - TRC_DEBUG(ACCELERATOR, "strimp prefiltering of " LLFMT - " items took " LLFMT " usec. Keeping " LLFMT + TRC_DEBUG(ACCELERATOR, "strimp prefiltering of " BUNFMT + " items took " LLFMT " usec. Keeping " BUNFMT " items (%.2f%%).\n", ncand, GDKusec()-t0, r->batCount, 100*r->batCount/(double)ncand); TRC_DEBUG(ACCELERATOR, "r->" ALGOBATFMT "\n", ALGOBATPAR(r) ); ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Fix some whitespace inconsistency
Changeset: 459f85f095b5 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/459f85f095b5 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Fix some whitespace inconsistency diffs (118 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -225,7 +225,7 @@ STRMPchoosePairs(PairHistogramElem *hist if (max_counts[cmin_max] < hist[i].cnt) { max_counts[cmin_max] = hist[i].cnt; indices[cmin_max] = i; -for(hidx = cmin_max; hidx > 0 && max_counts[hidx] > max_counts[hidx-1]; hidx--) { + for(hidx = cmin_max; hidx > 0 && max_counts[hidx] > max_counts[hidx-1]; hidx--) { swp(max_counts, hidx, hidx-1, uint64_t); swp(indices, hidx, hidx-1, size_t); } @@ -350,7 +350,7 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s) if (b->tstrimps == NULL) { MT_lock_set(>batIdxLock); /* Make sure no other thread got here first */ -if (b->tstrimps == NULL && + if (b->tstrimps == NULL && STRMPbuildHeader(b, s, hpairs)) { /* Find the header pairs, put the result in hpairs */ sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the descriptor and the pair sizes */ for (i = 0; i < STRIMP_HEADER_SIZE; i++) { @@ -389,7 +389,7 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s) } MT_lock_unset(>batIdxLock); } -return b->tstrimps; + return b->tstrimps; } /* This macro takes a bat and checks if the strimp construction has been @@ -480,20 +480,20 @@ BATcheckstrimps(BAT *b) GDKclrerr();/* we're not currently interested in errors */ } MT_lock_unset(>batIdxLock); -} + } /* The string imprint is initialized if the strimp pointer is * not null and the number of bitstrings is equal to the bat * count. 
*/ assert(!b->tstrimps || (b->tstrimps->strimps.free - HSIZE(((uint64_t *)b->tstrimps->strimps.base)[0]))/sizeof(uint64_t) <= b->batCount); ret = STRIMP_COMPLETE(b); -if (ret) { + if (ret) { TRC_DEBUG(ACCELERATOR, "BATcheckstrimps(" ALGOBATFMT "): already has strimps, waited " LLFMT " usec\n", ALGOBATPAR(b), GDKusec() - t); } -return ret; + return ret; } /* Filter a BAT b using a string q. Return the result as a candidate @@ -544,23 +544,23 @@ STRMPfilter(BAT *b, BAT *s, char *q) for (i = 0; i < ncand; i++) { x = canditer_next(); if ((bitstring_array[x] & qbmask) == qbmask) { - if (BUNappend(r, , false) != GDK_SUCCEED) { -BBPunfix(r->batCacheid); -goto sfilter_fail; - } -} -} + if (BUNappend(r, , false) != GDK_SUCCEED) { + BBPunfix(r->batCacheid); + goto sfilter_fail; + } + } + } -r->tkey = true; -r->tsorted = true; -r->trevsorted = BATcount(r) <= 1; -r->tnil = false; -r->tnonil = true; -TRC_DEBUG(ACCELERATOR, "strimp prefiltering of " LLFMT + r->tkey = true; + r->tsorted = true; + r->trevsorted = BATcount(r) <= 1; + r->tnil = false; + r->tnonil = true; + TRC_DEBUG(ACCELERATOR, "strimp prefiltering of " LLFMT " items took " LLFMT " usec. 
Keeping " LLFMT " items (%.2f%%).\n", ncand, GDKusec()-t0, r->batCount, 100*r->batCount/(double)ncand); -TRC_DEBUG(ACCELERATOR, "r->" ALGOBATFMT "\n", ALGOBATPAR(r) ); + TRC_DEBUG(ACCELERATOR, "r->" ALGOBATFMT "\n", ALGOBATPAR(r) ); return virtualize(r); sfilter_fail: @@ -659,7 +659,7 @@ STRMPcreate(BAT *b, BAT *s) } /* Disable this before merging to default */ -if (VIEWtparent(b)) { + if (VIEWtparent(b)) { pb = BBP_cache(VIEWtparent(b)); assert(pb); } else { @@ -669,7 +669,7 @@ STRMPcreate(BAT *b, BAT *s) if (BATcheckstrimps(pb)) return GDK_SUCCEED; -if ((h = STRMPcreateStrimpHeap(pb, s)) == NULL) { + if ((h = STRMPcreateStrimpHeap(pb, s)) == NULL) { return GDK_FAIL; } dh = (uint64_t *)h->strimps_base + b->hseqbase; @@ -871,7 +871,7 @@ STRMPndigrams(BAT *b, size_t *n) *n = 0; for (i = 0; i < b->batCount; i++) { s = (char *)BUNtail(bi, i); -// *n += STRMP_strlen(s) - 1; + // *n += STRMP_strlen(s) - 1;
MonetDB: string_imprints - Merge with default
Changeset: 5d4525349513 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/5d4525349513 Modified Files: gdk/CMakeLists.txt gdk/gdk.h gdk/gdk_bbp.c gdk/gdk_private.h sql/backends/monet5/CMakeLists.txt sql/backends/monet5/sql.c sql/scripts/CMakeLists.txt Branch: string_imprints Log Message: Merge with default diffs (truncated from 510485 to 300 lines): diff --git a/common/utils/matomic.h b/common/utils/matomic.h --- a/common/utils/matomic.h +++ b/common/utils/matomic.h @@ -75,7 +75,7 @@ typedef volatile atomic_ulong ATOMIC_TYP #define ATOMIC_INIT(var, val) atomic_init(var, (ATOMIC_BASE_TYPE) (val)) #define ATOMIC_DESTROY(var)((void) 0) -#define ATOMIC_GET(var)atomic_load(var) +#define ATOMIC_GET(var)((ATOMIC_BASE_TYPE) atomic_load(var)) #define ATOMIC_SET(var, val) atomic_store(var, (ATOMIC_BASE_TYPE) (val)) #define ATOMIC_XCG(var, val) atomic_exchange(var, (ATOMIC_BASE_TYPE) (val)) #define ATOMIC_CAS(var, exp, des) atomic_compare_exchange_strong(var, exp, (ATOMIC_BASE_TYPE) (des)) @@ -134,7 +134,7 @@ typedef __declspec(align(8)) volatile AT #if SIZEOF_SIZE_T == 8 #ifdef __INTEL_COMPILER -#define ATOMIC_GET(var)_InterlockedExchangeAdd64(var, 0) +#define ATOMIC_GET(var)((ATOMIC_BASE_TYPE) _InterlockedExchangeAdd64(var, 0)) #else #define ATOMIC_GET(var)(*(var)) /* should we use _InterlockedExchangeAdd64(var, 0) instead? 
*/ @@ -162,7 +162,7 @@ ATOMIC_CAS(ATOMIC_TYPE *var, ATOMIC_BASE #else #ifdef DECLSPEC_NOINITALL -#define ATOMIC_GET(var) _InlineInterlockedExchangeAdd64(var, 0) +#define ATOMIC_GET(var)((ATOMIC_BASE_TYPE) _InlineInterlockedExchangeAdd64(var, 0)) #define ATOMIC_SET(var, val) _InlineInterlockedExchange64(var, (ATOMIC_BASE_TYPE) (val)) #define ATOMIC_XCG(var, val) _InlineInterlockedExchange64(var, (ATOMIC_BASE_TYPE) (val)) #define ATOMIC_ADD(var, val) _InlineInterlockedExchangeAdd64(var, (ATOMIC_BASE_TYPE) (val)) @@ -172,7 +172,7 @@ ATOMIC_CAS(ATOMIC_TYPE *var, ATOMIC_BASE #define ATOMIC_OR(var, val)_InlineInterlockedOr64(var, (ATOMIC_BASE_TYPE) (val)) #define ATOMIC_AND(var, val) _InlineInterlockedAnd64(var, (ATOMIC_BASE_TYPE) (val)) #else -#define ATOMIC_GET(var)_InterlockedExchangeAdd64(var, 0) +#define ATOMIC_GET(var)((ATOMIC_BASE_TYPE) _InterlockedExchangeAdd64(var, 0)) #define ATOMIC_SET(var, val) _InterlockedExchange64(var, (ATOMIC_BASE_TYPE) (val)) #define ATOMIC_XCG(var, val) _InterlockedExchange64(var, (ATOMIC_BASE_TYPE) (val)) #define ATOMIC_ADD(var, val) _InterlockedExchangeAdd64(var, (ATOMIC_BASE_TYPE) (val)) @@ -233,7 +233,7 @@ typedef volatile ATOMIC_BASE_TYPE ATOMIC #define ATOMIC_INIT(var, val) (*(var) = (val)) #define ATOMIC_DESTROY(var)((void) 0) -#define ATOMIC_GET(var)__atomic_load_n(var, __ATOMIC_SEQ_CST) +#define ATOMIC_GET(var)((ATOMIC_BASE_TYPE) __atomic_load_n(var, __ATOMIC_SEQ_CST)) #define ATOMIC_SET(var, val) __atomic_store_n(var, (ATOMIC_BASE_TYPE) (val), __ATOMIC_SEQ_CST) #define ATOMIC_XCG(var, val) __atomic_exchange_n(var, (ATOMIC_BASE_TYPE) (val), __ATOMIC_SEQ_CST) #define ATOMIC_CAS(var, exp, des) __atomic_compare_exchange_n(var, exp, (ATOMIC_BASE_TYPE) (des), false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) diff --git a/gdk/CMakeLists.txt b/gdk/CMakeLists.txt --- a/gdk/CMakeLists.txt +++ b/gdk/CMakeLists.txt @@ -18,6 +18,7 @@ set(gdk_public_headers $ $ $ + $ $ $ $ @@ -28,7 +29,8 @@ set(gdk_public_headers $ $ $ - $) + $ + $) 
add_library(bat SHARED) diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -863,9 +863,6 @@ mskGetVal(BAT *b, BUN p) * HEAPcopy (Heap *dst,*src); * @item int * @tab - * HEAPdelete (Heap *dst, str o, str ext); - * @item int - * @tab * HEAPwarm (Heap *h); * @end multitable * diff --git a/gdk/gdk_align.c b/gdk/gdk_align.c --- a/gdk/gdk_align.c +++ b/gdk/gdk_align.c @@ -91,7 +91,7 @@ VIEWcreate(oid seq, BAT *b) return BATdense(seq, b->tseqbase, b->batCount); } - bn = BATcreatedesc(seq, b->ttype, false, TRANSIENT); + bn = BATcreatedesc(seq, b->ttype, false, TRANSIENT, 0); if (bn == NULL) return NULL; assert(bn->theap == NULL); @@ -193,8 +193,7 @@ BATmaterialize(BAT *b) .parentid = b->batCacheid, .dirty = true, }; - strconcat_len(tail->filename, sizeof(tail->filename), - BBP_physical(b->batCacheid), ".tail", NULL); + settailname(tail, BBP_physical(b->batCacheid), TYPE_oid, 0); if (HEAPalloc(tail, cnt, sizeof(oid), 0) != GDK_SUCCEED) { GDKfree(tail); return GDK_FAIL;
MonetDB: string_imprints - Remove assertion
Changeset: a257bb9341c6 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/a257bb9341c6 Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Remove assertion diffs (23 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -485,7 +485,7 @@ BATcheckstrimps(BAT *b) * not null and the number of bitstrings is equal to the bat * count. */ - assert(!b->tstrimps || (b->tstrimps->strimps.free - HSIZE(((uint64_t *)b->tstrimps->strimps.base)[0]))/sizeof(uint64_t) <= b->batCount); + // assert(!b->tstrimps || (b->tstrimps->strimps.free - HSIZE(((uint64_t *)b->tstrimps->strimps.base)[0]))/sizeof(uint64_t) <= b->batCount); ret = STRIMP_COMPLETE(b); if (ret) { TRC_DEBUG(ACCELERATOR, @@ -692,8 +692,9 @@ STRMPcreate(BAT *b, BAT *s) MT_lock_unset(>batIdxLock); /* The thread that reaches this point last needs to write the strimp to disk. */ - if (STRIMP_COMPLETE(pb)) + if (STRIMP_COMPLETE(pb)) { persistStrimp(pb); + } TRC_DEBUG(ACCELERATOR, "strimp creation took " LLFMT " usec\n", GDKusec()-t0); return GDK_SUCCEED; ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Change gdk_use_strimps to yes-no vari...
Changeset: 9ee8a080cd94 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/9ee8a080cd94 Modified Files: monetdb5/modules/mal/pcre.c Branch: string_imprints Log Message: Change gdk_use_strimps to yes-no variable diffs (12 lines): diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c --- a/monetdb5/modules/mal/pcre.c +++ b/monetdb5/modules/mal/pcre.c @@ -1873,7 +1873,7 @@ PCRElikeselect(bat *ret, const bat *bid, str msg = MAL_SUCCEED; char *ppat = NULL; bool use_re = false, use_strcmp = false, empty = false; - bool use_strimps = GDKgetenv_int("gdk_use_strimps", 0); + bool use_strimps = GDKgetenv_isyes("gdk_use_strimps"); if ((b = BATdescriptor(*bid)) == NULL) { msg = createException(MAL, "algebra.likeselect", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - When filtering log how many elements ...
Changeset: 23861925c0dc for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/23861925c0dc Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: When filtering log how many elements remain diffs (41 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -532,21 +532,23 @@ STRMPfilter(BAT *b, BAT *s, char *q) for (i = 0; i < ncand; i++) { x = canditer_next(); if ((bitstring_array[x] & qbmask) == qbmask) { - if (BUNappend(r, , false) != GDK_SUCCEED) { - BBPunfix(r->batCacheid); - goto sfilter_fail; - } - } - } + if (BUNappend(r, , false) != GDK_SUCCEED) { +BBPunfix(r->batCacheid); +goto sfilter_fail; + } +} +} - r->tkey = true; - r->tsorted = true; - r->trevsorted = BATcount(r) <= 1; - r->tnil = false; - r->tnonil = true; - TRC_DEBUG(ACCELERATOR, "strimp prefiltering of " LLFMT - " items took " LLFMT " usec\n", ncand, GDKusec()-t0); - TRC_DEBUG(ACCELERATOR, "r->" ALGOBATFMT "\n", ALGOBATPAR(r) ); +r->tkey = true; +r->tsorted = true; +r->trevsorted = BATcount(r) <= 1; +r->tnil = false; +r->tnonil = true; +TRC_DEBUG(ACCELERATOR, "strimp prefiltering of " LLFMT + " items took " LLFMT " usec. Keeping " LLFMT + " items (%.2f%%).\n", ncand, GDKusec()-t0, r->batCount, + 100*r->batCount/(double)ncand); +TRC_DEBUG(ACCELERATOR, "r->" ALGOBATFMT "\n", ALGOBATPAR(r) ); return virtualize(r); ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Define a full pipeline using the stri...
Changeset: ca39849883db for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/ca39849883db Modified Files: monetdb5/optimizer/opt_pipes.c Branch: string_imprints Log Message: Define a full pipeline using the strimps optimizer This is a copy of the default pipeline with the strimps optimizer included but it unfortunately crashes the server. diffs (45 lines): diff --git a/monetdb5/optimizer/opt_pipes.c b/monetdb5/optimizer/opt_pipes.c --- a/monetdb5/optimizer/opt_pipes.c +++ b/monetdb5/optimizer/opt_pipes.c @@ -115,6 +115,41 @@ static struct PIPELINES { "optimizer.wlc();" "optimizer.garbageCollector();", "stable", NULL, 1}, + {"strimps_pipe", +"optimizer.inline();" +"optimizer.remap();" +"optimizer.costModel();" +"optimizer.coercions();" +"optimizer.aliases();" +"optimizer.evaluate();" +"optimizer.emptybind();" +"optimizer.deadcode();" /* Feb2021 update, I pushed deadcode optimizer earlier in the pipeline so it runs before mitosis, thus removing less instructions */ +"optimizer.pushselect();" +"optimizer.aliases();" +"optimizer.mitosis();" +"optimizer.mergetable();" +"optimizer.bincopyfrom();" +"optimizer.aliases();" +"optimizer.constants();" +"optimizer.commonTerms();" +"optimizer.projectionpath();" +"optimizer.deadcode();" +"optimizer.matpack();" +"optimizer.reorder();" +"optimizer.dataflow();" +"optimizer.querylog();" +"optimizer.multiplex();" +"optimizer.strimps();" +"optimizer.generator();" +"optimizer.profiler();" +"optimizer.candidates();" +//"optimizer.mask();" +"optimizer.deadcode();" +"optimizer.postfix();" +// "optimizer.jit();" awaiting the new batcalc api +"optimizer.wlc();" +"optimizer.garbageCollector();", +"stable", NULL, 1}, {"default_fast", "optimizer.defaultfast()", "stable", NULL, 1}, ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Merge with default
Changeset: f70db48d5bd9 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/f70db48d5bd9 Modified Files: gdk/gdk.h gdk/gdk_bbp.c gdk/gdk_strimps.c monetdb5/optimizer/opt_pipes.c monetdb5/optimizer/opt_prelude.c monetdb5/optimizer/opt_prelude.h monetdb5/optimizer/opt_wrapper.c monetdb5/optimizer/optimizer.c sql/backends/monet5/sql.c Branch: string_imprints Log Message: Merge with default diffs (truncated from 8029 to 300 lines): diff --git a/clients/Tests/MAL-signatures.stable.out b/clients/Tests/MAL-signatures.stable.out --- a/clients/Tests/MAL-signatures.stable.out +++ b/clients/Tests/MAL-signatures.stable.out @@ -9110,6 +9110,8 @@ [ "optimizer", "volcano_pipe", "function optimizer.volcano_pipe():void;", "", "" ] [ "optimizer", "wlc", "pattern optimizer.wlc():str ", "OPTwrapper;", "" ] [ "optimizer", "wlc", "pattern optimizer.wlc(X_0:str, X_1:str):str ", "OPTwrapper;", "Inject the workload capture-replay primitives" ] +[ "optimizer", "wrapper", "pattern optimizer.wrapper():str ", "OPTwrapper;", "" ] +[ "optimizer", "wrapper", "pattern optimizer.wrapper(X_0:str, X_1:str):str ", "OPTwrapper;", "Fake optimizer"] [ "pcre", "imatch", "command pcre.imatch(X_0:str, X_1:str):bit ", "PCREimatch;", "Caseless Perl Compatible Regular Expression pattern matching against a string" ] [ "pcre", "index","command pcre.index(X_0:pcre, X_1:str):int ", "PCREindex;", "match a pattern, return matched position (or 0 when not found)"] [ "pcre", "match","command pcre.match(X_0:str, X_1:str):bit ", "PCREmatch;", "Perl Compatible Regular Expression pattern matching against a string" ] diff --git a/clients/Tests/MAL-signatures.stable.out.int128 b/clients/Tests/MAL-signatures.stable.out.int128 --- a/clients/Tests/MAL-signatures.stable.out.int128 +++ b/clients/Tests/MAL-signatures.stable.out.int128 @@ -12410,6 +12410,8 @@ [ "optimizer", "volcano_pipe", "function optimizer.volcano_pipe():void;", "", "" ] [ "optimizer", "wlc", "pattern optimizer.wlc():str ", "OPTwrapper;", "" ] [ "optimizer", 
"wlc", "pattern optimizer.wlc(X_0:str, X_1:str):str ", "OPTwrapper;", "Inject the workload capture-replay primitives" ] +[ "optimizer", "wrapper", "pattern optimizer.wrapper():str ", "OPTwrapper;", "" ] +[ "optimizer", "wrapper", "pattern optimizer.wrapper(X_0:str, X_1:str):str ", "OPTwrapper;", "Fake optimizer"] [ "pcre", "imatch", "command pcre.imatch(X_0:str, X_1:str):bit ", "PCREimatch;", "Caseless Perl Compatible Regular Expression pattern matching against a string" ] [ "pcre", "index","command pcre.index(X_0:pcre, X_1:str):int ", "PCREindex;", "match a pattern, return matched position (or 0 when not found)"] [ "pcre", "match","command pcre.match(X_0:str, X_1:str):bit ", "PCREmatch;", "Perl Compatible Regular Expression pattern matching against a string" ] diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out --- a/clients/Tests/exports.stable.out +++ b/clients/Tests/exports.stable.out @@ -773,7 +773,8 @@ void MPresetProfiler(stream *fdout); char *MSP_locate_sqlscript(const char *mod_name, bit recurse); str MSinitClientPrg(Client cntxt, const char *mod, const char *nme); void MSresetInstructions(MalBlkPtr mb, int start); -void MSresetVariables(Client cntxt, MalBlkPtr mb, MalStkPtr glb, int start); +void MSresetStack(Client cntxt, MalBlkPtr mb, MalStkPtr glb); +void MSresetVariables(MalBlkPtr mb); void MSscheduleClient(str command, str challenge, bstream *fin, stream *fout, protocol_version protocol, size_t blocksize); str MSserveClient(Client cntxt); str OIDXcreateImplementation(Client cntxt, int tpe, BAT *b, int pieces); @@ -1137,6 +1138,7 @@ const char *mdbRef; void mdbSetBreakRequest(Client cntxt, MalBlkPtr mb, str request, char cmd); const char *mergecandRef; const char *mergepackRef; +const char *mergetableRef; const char *minRef; const char *min_no_nilRef; const char *minusRef; @@ -1227,7 +1229,6 @@ const char *plusRef; const char *postludeRef; const char *preludeRef; MalStkPtr prepareMALstack(MalBlkPtr mb, int size); -int 
prepareMalBlk(MalBlkPtr mb, str s); void printFunction(stream *fd, MalBlkPtr mb, MalStkPtr stk, int flg); void printInstruction(stream *fd, MalBlkPtr mb, MalStkPtr stk, InstrPtr p, int flg); const char *printRef; @@ -1386,6 +1387,7 @@ const char *thetajoinRef; const char *thetaselectRef; const char *tidRef; const char *timestampRef; +const char *totalRef; void traceFunction(component_t comp, MalBlkPtr mb, MalStkPtr stk, int flg); void traceInstruction(component_t comp, MalBlkPtr mb, MalStkPtr stk, InstrPtr p, int flg); const char *transactionRef; diff --git a/common/stream/fwf.c b/common/stream/fwf.c --- a/common/stream/fwf.c
MonetDB: string_imprints - Fix the strimp optimizer pipe
Changeset: 636a2b626cfa for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/636a2b626cfa Modified Files: monetdb5/optimizer/opt_pipes.c monetdb5/optimizer/opt_strimps.c Branch: string_imprints Log Message: Fix the strimp optimizer pipe diffs (91 lines): diff --git a/monetdb5/optimizer/opt_pipes.c b/monetdb5/optimizer/opt_pipes.c --- a/monetdb5/optimizer/opt_pipes.c +++ b/monetdb5/optimizer/opt_pipes.c @@ -58,6 +58,8 @@ static struct PIPELINES { "optimizer.inline();" "optimizer.remap();" "optimizer.bincopyfrom();" +"optimizer.aliases();" +"optimizer.constants();" "optimizer.deadcode();" "optimizer.multiplex();" "optimizer.strimps();" diff --git a/monetdb5/optimizer/opt_strimps.c b/monetdb5/optimizer/opt_strimps.c --- a/monetdb5/optimizer/opt_strimps.c +++ b/monetdb5/optimizer/opt_strimps.c @@ -23,14 +23,13 @@ str OPTstrimpsImplementation(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) { - int i, limit, slimit, needed =0, actions=0; + int i, limit, slimit, actions=0; + bool needed = false; // int mvcvar = -1; InstrPtr p, q, *old = mb->stmt; char buf[256]; lng usec = GDKusec(); str msg = MAL_SUCCEED; - /* int res, nvar; */ - /* ValRecord cst; */ int res; (void) pci; @@ -44,8 +43,11 @@ OPTstrimpsImplementation(Client cntxt, M for(i=0; i < limit; i++) { p = old[i]; - if (getModuleId(p) == algebraRef && getFunctionId(p) == likeselectRef) - needed = 1; + if (getModuleId(p) == algebraRef && + getFunctionId(p) == likeselectRef) { + needed = true; + break; + } } if (!needed) @@ -65,31 +67,31 @@ OPTstrimpsImplementation(Client cntxt, M /* Look for bind operations on strings, because for those we migh need strimps */ - if (getModuleId(p) == algebraRef && getFunctionId(p) == likeselectRef) { - - /* cst.vtype = TYPE_bit; */ - /* nvar = defConstant(mb, TYPE_bit, ); */ + if (getModuleId(p) == algebraRef && + getFunctionId(p) == likeselectRef) { q = newInstruction(mb, strimpsRef, strimpFilterSelectRef); res = newTmpVariable(mb, newBatType(TYPE_oid)); setDestVar(q, 
res); - q = addArgument(mb, q, getArg(p, 1)); - q = addArgument(mb, q, getArg(p, 2)); - q = addArgument(mb, q, getArg(p, 3)); - q = addArgument(mb, q, getArg(p, 6)); + q = pushArgument(mb, q, getArg(p, 1)); + q = pushArgument(mb, q, getArg(p, 2)); + q = pushArgument(mb, q, getArg(p, 3)); + q = pushArgument(mb, q, getArg(p, 6)); pushInstruction(mb, q); - typeChecker(cntxt->usermodule, mb, q, mb->stop-1, TRUE); + typeChecker(cntxt->usermodule, mb, q, mb->stop - 1, TRUE); - p = setArgument(mb, p, 2, getArg(q, 0)); + getArg(p, 2) = res; + // setArgument(mb, p, 2, res); actions++; + /* continue; */ } pushInstruction(mb, p); } (void)slimit; - /* for (; i < slimit; i++) */ - /* if (old[i]) */ - /* freeInstruction(old[i]); */ + for (; i < slimit; i++) + if (old[i]) + freeInstruction(old[i]); GDKfree(old); /* Defense line against incorrect plans */ ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Update comment
Changeset: 63aecf69eb6a for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/63aecf69eb6a Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Update comment diffs (173 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -12,11 +12,10 @@ * A string imprint is an index that can be used as a prefilter in LIKE * queries. It has 2 components: * - * - a header of 32 or 64 string element pairs. + * - a header of 64 string element pairs. * - * - a 32 or 64 bit mask for each string in the BAT that encodes the - * presence or absence of each element of the header in the specific - * item. + * - a 64 bit mask for each string in the BAT that encodes the presence + * or absence of each element of the header in the specific item. * * A string imprint is stored in a new Heap in the BAT, aligned in 8 * byte (64 bit) words. @@ -24,40 +23,45 @@ * The first 64 bit word, the header descriptor, describes how the * header of the strimp is encoded. The least significant byte (v in the * schematic below) is the version number. The second (np) is the number - * of pairs in the header. The next 2 bytes (hs) is the size of the - * header in bytes. Finally the fifth byte is the persistence byte. The - * last 3 bytes needed to align to the 8 byte boundary should be zero, - * and are reserved for future use. + * of pairs in the header. In the current implementation this is always + * 64. The next 2 bytes (hs) is the total size of the header in + * bytes. Finally the fifth byte is the persistence byte. The last 3 + * bytes needed to align to the 8 byte boundary should be zero, and are + * reserved for future use. * * The following np bytes are the sizes of the pairs. These can have * values from 2 to 8 and are the number of bytes that the corresponding * pair takes up. Following that there are the bytes encoding the actual * pairs. 
* - * | v | np | hs | p | reserved | 8bytes - * | | --- - * Strimp Header | - * | psz_0 | psz_1 | ... | | - * | | --- | - * | |np bytes | - * | ... | psz_n | --- hs bytes - * | pair_0 | pair_1| | - * |...| | - * | pair_k-1 | pair_k | | - * | pair_n | | - * | | --- + * | 1byte | 1byte | 1byte | 1byte | 1byte | 1byte | 1byte | 1byte | + * |---| + * | v | np | hs | p | reserved | 8bytes --- + * |---| ___ | + * | psz_0 | psz_1 | ... | | | + * | | | | + * | |np bytes | + * | | | | + * | ... | psz_n | | hs bytes + * |---| ___ | + * | pair_0| pair_1| | + * | ... | | + * | pair_k-1 | pair_k | | + * | pair_n | | + * |---| --- * * - * The bitmasks for each string in the BAT follow after this. + * The bitmasks for each string in the BAT follow after this, aligned to + * the string BAT. * * Strimp creation goes as follows: * * - Construct a histogram of the element (byte or character) pairs for * all the strings in the BAT. * - * - Take the 32/64 most frequent pairs as the Strimp Header. + * - Take the 64 most frequent pairs as the Strimp Header. * - * - For each string in the bat construct a 32/64 bit mask that encodes + * - For each string in the bat construct a 64 bit mask that encodes * the presence or absence of each member of the header in the string. */ @@ -80,8 +84,8 @@ #define NPAIRS(d) (((d) >> 8) & 0xff) #define HSIZE(d) (((d) >> 16) & 0x) -#undef UTF8STRINGS
MonetDB: string_imprints - Merge with default
ct most users not to notice this change, as + such schema changes aren't usually done concurrently. + +* Tue Jul 20 2021 Sjoerd Mullender - 11.41.1-20210723 +- clients: The MonetDB stethoscope has been removed. There is now a separate + package available with PIP (monetdb_stethoscope) or as an RPM or DEB + package (stethoscope) from the monetdb.org repository. + +* Tue Jul 20 2021 Sjoerd Mullender - 11.41.1-20210723 +- gdk: A new type, called msk, was introduced. This is a bit mask type. + In a bat with type msk, each row occupies a single bit, so 8 rows are + stored in a single byte. There is no NULL value for this type. +- gdk: The function of the BAT iterator (type BATiter, function bat_iterator) + has been expanded. The iterator now contains more information about + the BAT, and it contains a pointer to the heaps (theap and tvheap) + that are stable, at least in the sense that they will remain available + even when parallel threads update the BAT and cause those heaps to grow + (and therefore possibly move in memory). A call to bat_iterator must + now be accompanied by a call to bat_iterator_end. + +* Mon Jun 7 2021 Sjoerd Mullender - 11.41.1-20210723 +- monetdb5: When using the --in-memory option, mserver5 will run completely in + memory, i.e. not create a database on disk. The server can still be + connected to using the name of the in-memory database. This name is + "in-memory". + +* Tue May 11 2021 Sjoerd Mullender - 11.41.1-20210723 +- sql: There is now a function sys.current_sessionid() to return the session + ID of the current session. This ID corresponds with the sessionid in + the sys.queue() result. + +* Mon May 10 2021 Panagiotis Koutsourakis - 11.41.1-20210723 +- merovingian: Deprecate `profilerstart` and `profilerstop` commands. Since + stethoscope is a separate project (https://github.com/MonetDBSolutions/monetdb-pystethoscope) + the installation directory is not standard anymore. 
`profilerstart` and + `profilerstop` commands assume that the stethoscope executable is in the + same directory as `mserver5`. This is no longer necessarily true since + stethoscope can now be installed in a python virtual environment. The + commands still work if stethoscope is installed using the official + MonetDB installers, or if a symbolic link is created in the directory + where `mserver5` is located. + +* Fri May 7 2021 Sjoerd Mullender - 11.41.1-20210723 +- odbc: A typo that made the SQLSpecialColumns function unusable was fixed. + +* Mon May 3 2021 Pedro Ferreira - 11.41.1-20210723 +- sql: Merge statements could not produce correct results on complex join + conditions, so a renovation was made. As a consequence, subqueries + now have to be disabled on merge join conditions. + +* Mon May 3 2021 svetlin - 11.41.1-20210723 +- sql: preserve in-query comments + +* Mon May 3 2021 Sjoerd Mullender - 11.41.1-20210723 +- merovingian: The exittimeout value can now be set to a negative value (e.g. -1) to + indicate that when stopping the dbfarm (using monetdbd stop dbfarm), + any mserver5 processes are to be sent a termination signal and then + waited for until they terminate. In addition, if exittimeout is greater + than zero, the mserver5 processes are sent a SIGKILL signal after the + specified timeout and the managing monetdbd is sent a SIGKILL signal + after another five seconds (if it didn't terminate already). The old + situation was that the managing monetdbd process was sent a SIGKILL + after 30 seconds, and the mserver5 processes that hadn't terminated + yet would be allowed to continue their termination sequence. + +* Mon May 3 2021 Sjoerd Mullender - 11.41.1-20210723 +- gdk: Implemented function BUNreplacemultiincr to replace multiple values + in a BAT in one go, starting at a given position. +- gdk: Implemented new function BUNreplacemulti to replace multiple values + in a BAT in one go, at the given positions. 
+- gdk: Removed function BUNinplace, just use BUNreplace, and check whether + the BAT argument is of type TYPE_void before calling if you don't + want to materialize. + +* Mon May 3 2021 Pedro Ferreira - 11.41.1-20210723 +- sql: Use of CTEs inside UPDATE and DELETE statements are now more ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Make sure the strimp is created befor...
Changeset: 87cd6fa29635 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/87cd6fa29635 Modified Files: monetdb5/modules/mal/strimps.c Branch: string_imprints Log Message: Make sure the strimp is created before filtering diffs (12 lines): diff --git a/monetdb5/modules/mal/strimps.c b/monetdb5/modules/mal/strimps.c --- a/monetdb5/modules/mal/strimps.c +++ b/monetdb5/modules/mal/strimps.c @@ -124,6 +124,8 @@ PATstrimpFilterSelect(Client cntxt, MalB assert(s->ttype == TYPE_void); + if(STRMPcreate(b) != GDK_SUCCEED) + throw(MAL, "strimps.strimpfilter", SQLSTATE(HY002) "strimp creation failed"); pat = *getArgReference_str(stk, pci, 3); if ((ob = STRMPfilter(b, pat)) == NULL) { ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Only one thread should compute the he...
Changeset: 8671d66745fb for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/8671d66745fb Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Only one thread should compute the header and allocate the heap diffs (261 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -216,7 +216,7 @@ STRMPchoosePairs(PairHistogramElem *hist const size_t cmin_max = STRIMP_HEADER_SIZE - 1; size_t hidx; - TRC_DEBUG_IF(ALGO) t0 = GDKusec(); + TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec(); for(i = 0; i < hist_size; i++) { if (max_counts[cmin_max] < hist[i].cnt) { @@ -234,7 +234,7 @@ STRMPchoosePairs(PairHistogramElem *hist cp[i].psize = hist[indices[i]].p->psize; } - TRC_DEBUG(ALGO, LLFMT " usec\n", GDKusec() - t0); + TRC_DEBUG(ACCELERATOR, LLFMT " usec\n", GDKusec() - t0); } static bool @@ -249,7 +249,7 @@ STRMPbuildHeader(BAT *b, CharPair *hpair PairIterator pi, *pip; CharPair cp, *cpp; - TRC_DEBUG_IF(ALGO) t0 = GDKusec(); + TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec(); hlen = STRIMP_HISTSIZE; if ((hist = (PairHistogramElem *)GDKmalloc(hlen*sizeof(PairHistogramElem))) == NULL) { // TODO handle error @@ -317,7 +317,7 @@ STRMPbuildHeader(BAT *b, CharPair *hpair } GDKfree(hist); - TRC_DEBUG(ALGO, LLFMT " usec\n", GDKusec() - t0); + TRC_DEBUG(ACCELERATOR, LLFMT " usec\n", GDKusec() - t0); return true; } @@ -333,39 +333,48 @@ STRMPcreateStrimpHeap(BAT *b) CharPair hpairs[STRIMP_HEADER_SIZE]; const char *nme; - - STRMPbuildHeader(b, hpairs); /* Find the header pairs */ - sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the descriptor */ - for(i = 0; i < STRIMP_HEADER_SIZE; i++) { - sz += hpairs[i].psize; - } + if (b->tstrimps == NULL) { + MT_lock_set(>batIdxLock); + /* Make sure no other thread got here first */ +if (b->tstrimps == NULL) { + STRMPbuildHeader(b, hpairs); /* Find the header pairs */ + sz = 8 + STRIMP_HEADER_SIZE; /* add 8-bytes for the descriptor */ + for (i = 0; i < STRIMP_HEADER_SIZE; i++) { + sz += 
hpairs[i].psize; + } - nme = GDKinmemory(b->theap->farmid) ? ":memory:" : BBP_physical(b->batCacheid); - /* Allocate the strimps heap */ - if ((r = GDKzalloc(sizeof(Strimps))) == NULL || - (r->strimps.farmid = BBPselectfarm(b->batRole, b->ttype, strimpheap)) < 0 || - strconcat_len(r->strimps.filename, sizeof(r->strimps.filename), - nme, ".tstrimps", NULL) >= sizeof(r->strimps.filename) || - HEAPalloc(>strimps, BATcount(b)*sizeof(uint64_t) + sz, sizeof(uint8_t), 0) != GDK_SUCCEED) { - GDKfree(r); - return NULL; + nme = GDKinmemory(b->theap->farmid) ? ":memory:" : BBP_physical(b->batCacheid); + /* Allocate the strimps heap */ + if ((r = GDKzalloc(sizeof(Strimps))) == NULL || + (r->strimps.farmid = BBPselectfarm(b->batRole, b->ttype, strimpheap)) < 0 || + strconcat_len(r->strimps.filename, sizeof(r->strimps.filename), nme, + ".tstrimps", NULL) >= sizeof(r->strimps.filename) || + HEAPalloc(>strimps, BATcount(b) * sizeof(uint64_t) + sz, sizeof(uint8_t), 0) != GDK_SUCCEED) { + GDKfree(r); + MT_lock_unset(>batIdxLock); + return NULL; + } + + descriptor = STRIMP_VERSION | ((uint64_t)STRIMP_HEADER_SIZE) << 8 | ((uint64_t)sz) << 16; + + ((uint64_t *)r->strimps.base)[0] = descriptor; + r->sizes_base = h1 = (uint8_t *)r->strimps.base + 8; + r->pairs_base = h2 = (uint8_t *)h1 + STRIMP_HEADER_SIZE; + + for (i = 0; i < STRIMP_HEADER_SIZE; i++) { + *(h1 + i) = hpairs[i].psize; + memcpy(h2, hpairs[i].pbytes, hpairs[i].psize); + h2 += hpairs[i].psize; + } + r->strimps_base = h2; + r->strimps.free = sz; + + b->tstrimps = r; + b->batDirtydesc = true; + } + MT_lock_unset(>batIdxLock); } - - descriptor = STRIMP_VERSION | ((uint64_t)STRIMP_HEADER_SIZE) << 8 | ((uint64_t)sz) << 16; - - ((uint64_t *)r->strimps.base)[0] = descriptor; - r->sizes_base = h1 = (uint8_t *)r->strimps.base
MonetDB: string_imprints - Merge with default
Changeset: 39e30b8b6392 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/39e30b8b6392 Modified Files: gdk/gdk_bbp.c gdk/gdk_private.h monetdb5/modules/mal/batcalc.c monetdb5/modules/mal/pcre.c sql/backends/monet5/sql.c Branch: string_imprints Log Message: Merge with default diffs (truncated from 9006 to 300 lines): diff --git a/clients/Tests/MAL-signatures.stable.out b/clients/Tests/MAL-signatures.stable.out --- a/clients/Tests/MAL-signatures.stable.out +++ b/clients/Tests/MAL-signatures.stable.out @@ -718,29 +718,29 @@ [ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:any_1], X_1:bat[:any_1], X_2:bat[:oid], X_3:bat[:oid], X_4:bit):bat[:bit] ", "CMDbatNE;","" ] [ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:any_1], X_1:bat[:any_1], X_2:bit):bat[:bit] ", "CMDbatNE;","" ] [ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:int]):bat[:bit] ", "CMDbatNE;","" ] -[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:int]):bat[:bit] ", "CMDbatNE;","" ] [ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:int], X_2:bat[:oid], X_3:bat[:oid]):bat[:bit] ","CMDbatNE;","" ] -[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:int], X_2:bat[:oid], X_3:bat[:oid]):bat[:bit] ","CMDbatNE;","" ] -[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:lng]):bat[:bit] ", "CMDbatNE;","" ] +[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:int], X_2:bat[:oid], X_3:bat[:oid], X_4:bit):bat[:bit] ", "CMDbatNE;","" ] +[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:int], X_2:bit):bat[:bit] ", "CMDbatNE;","" ] [ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:lng]):bat[:bit] ", "CMDbatNE;","" ] [ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:lng], X_2:bat[:oid], X_3:bat[:oid]):bat[:bit] ","CMDbatNE;","" ] -[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:lng], X_2:bat[:oid], X_3:bat[:oid]):bat[:bit] ","CMDbatNE;","" ] -[ "batcalc", "!=", "pattern 
batcalc.!=(X_0:bat[:bte], X_1:bat[:sht]):bat[:bit] ", "CMDbatNE;","" ] +[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:lng], X_2:bat[:oid], X_3:bat[:oid], X_4:bit):bat[:bit] ", "CMDbatNE;","" ] +[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:lng], X_2:bit):bat[:bit] ", "CMDbatNE;","" ] [ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:sht]):bat[:bit] ", "CMDbatNE;","" ] [ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:sht], X_2:bat[:oid], X_3:bat[:oid]):bat[:bit] ","CMDbatNE;","" ] -[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:sht], X_2:bat[:oid], X_3:bat[:oid]):bat[:bit] ","CMDbatNE;","" ] -[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:int):bat[:bit] ","CMDbatNE;","" ] +[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:sht], X_2:bat[:oid], X_3:bat[:oid], X_4:bit):bat[:bit] ", "CMDbatNE;","" ] +[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:bat[:sht], X_2:bit):bat[:bit] ", "CMDbatNE;","" ] [ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:int):bat[:bit] ","CMDbatNE;","" ] [ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:int, X_2:bat[:oid]):bat[:bit] ", "CMDbatNE;","" ] -[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:int, X_2:bat[:oid]):bat[:bit] ", "CMDbatNE;","" ] -[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:lng):bat[:bit] ","CMDbatNE;","" ] +[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:int, X_2:bat[:oid], X_3:bit):bat[:bit] ","CMDbatNE;","" ] +[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:int, X_2:bit):bat[:bit] ", "CMDbatNE;","" ] [ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:lng):bat[:bit] ","CMDbatNE;","" ] [ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:lng, X_2:bat[:oid]):bat[:bit] ", "CMDbatNE;","" ] -[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:lng, X_2:bat[:oid]):bat[:bit] ", "CMDbatNE;","" ] -[ "batcalc", "!=", "pattern 
batcalc.!=(X_0:bat[:bte], X_1:sht):bat[:bit] ","CMDbatNE;","" ] +[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:lng, X_2:bat[:oid], X_3:bit):bat[:bit] ","CMDbatNE;","" ] +[ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:lng, X_2:bit):bat[:bit] ", "CMDbatNE;","" ] [ "batcalc", "!=", "pattern batcalc.!=(X_0:bat[:bte], X_1:sht):bat[:bit] ","CMDbatNE;","" ] [ "batcalc",
MonetDB: string_imprints - Avoid dividing in favor of multiplying
Changeset: ecf3c1a4555f for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/ecf3c1a4555f Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Avoid dividing in favor of multiplying diffs (20 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -400,7 +400,7 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s) #define STRIMP_COMPLETE(b) \ b->tstrimps != NULL && \ (b->tstrimps == (Strimps *)1 || \ -(b->tstrimps->strimps.free - ((char *)b->tstrimps->bitstrings_base - b->tstrimps->strimps.base))/sizeof(uint64_t) == b->batCount) +(b->tstrimps->strimps.free - ((char *)b->tstrimps->bitstrings_base - b->tstrimps->strimps.base)) == b->batCount*sizeof(uint64_t)) static bool BATcheckstrimps(BAT *b) @@ -485,7 +485,6 @@ BATcheckstrimps(BAT *b) * not null and the number of bitstrings is equal to the bat * count. */ - // assert(!b->tstrimps || (b->tstrimps->strimps.free - HSIZE(((uint64_t *)b->tstrimps->strimps.base)[0]))/sizeof(uint64_t) <= b->batCount); ret = STRIMP_COMPLETE(b); if (ret) { TRC_DEBUG(ACCELERATOR, ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Name pointer more appropriately
Changeset: d34debd8ca7e for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/d34debd8ca7e Modified Files: gdk/gdk_private.h gdk/gdk_strimps.c Branch: string_imprints Log Message: Name pointer more appropriately diffs (62 lines): diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h --- a/gdk/gdk_private.h +++ b/gdk/gdk_private.h @@ -418,8 +418,8 @@ struct Strimps { Heap strimps; uint8_t *sizes_base;/* pointer into strimps heap (pair sizes) */ uint8_t *pairs_base;/* pointer into strimps heap (pairs start) */ - void *strimps_base; /* pointer into strimps heap (strimps start) */ - /* strimps_base is a pointer to either a uint32_t or a uint64_t */ + void *bitstrings_base; /* pointer into strimps heap (bitstrings start) */ + /* bitstrings_base is a pointer to uint64_t */ }; typedef struct { diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -381,7 +381,7 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s) memcpy(h2, hpairs[i].pbytes, psize); h2 += psize; } - r->strimps_base = h2; + r->bitstrings_base = h2; r->strimps.free = sz; b->tstrimps = r; @@ -400,7 +400,7 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s) #define STRIMP_COMPLETE(b) \ b->tstrimps != NULL && \ (b->tstrimps == (Strimps *)1 || \ -(b->tstrimps->strimps.free - ((char *)b->tstrimps->strimps_base - b->tstrimps->strimps.base))/sizeof(uint64_t) == b->batCount) +(b->tstrimps->strimps.free - ((char *)b->tstrimps->bitstrings_base - b->tstrimps->strimps.base))/sizeof(uint64_t) == b->batCount) static bool BATcheckstrimps(BAT *b) @@ -461,7 +461,7 @@ BATcheckstrimps(BAT *b) && HEAPload(>strimps, nme, "tstrimps", false) == GDK_SUCCEED) { hp->sizes_base = (uint8_t *)hp->strimps.base + 8; /* sizes just after the descriptor */ hp->pairs_base = hp->sizes_base + npairs; /* pairs just after the offsets */ - hp->strimps_base = hp->strimps.base + hsize;/* bitmasks just after the pairs */ + hp->bitstrings_base = hp->strimps.base + hsize;/* bitmasks just after the pairs */ close(fd); 
hp->strimps.parentid = b->batCacheid; @@ -539,7 +539,7 @@ STRMPfilter(BAT *b, BAT *s, char *q) * (see the macro isIgnored). */ qbmask = STRMPmakebitstring(q, strmps); - bitstring_array = (uint64_t *)strmps->strimps_base; + bitstring_array = (uint64_t *)strmps->bitstrings_base; for (i = 0; i < ncand; i++) { x = canditer_next(); @@ -672,7 +672,7 @@ STRMPcreate(BAT *b, BAT *s) if ((h = STRMPcreateStrimpHeap(pb, s)) == NULL) { return GDK_FAIL; } - dh = (uint64_t *)h->strimps_base + b->hseqbase; + dh = (uint64_t *)h->bitstrings_base + b->hseqbase; ncand = canditer_init(, b, s); ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Merge heads
Changeset: b38e5d23af12 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/b38e5d23af12 Modified Files: sql/test/emptydb/Tests/check.stable.out sql/test/emptydb/Tests/check.stable.out.32bit sql/test/emptydb/Tests/check.stable.out.int128 Branch: string_imprints Log Message: Merge heads diffs (truncated from 358 to 300 lines): diff --git a/clients/Tests/MAL-signatures.stable.out b/clients/Tests/MAL-signatures.stable.out --- a/clients/Tests/MAL-signatures.stable.out +++ b/clients/Tests/MAL-signatures.stable.out @@ -9123,6 +9123,7 @@ [ "optimizer", "mergetable", "pattern optimizer.mergetable(X_0:str, X_1:str):str ", "OPTwrapper;", "Resolve the multi-table definitions" ] [ "optimizer", "minimal_fast", "function optimizer.minimal_fast():void;", "", "" ] [ "optimizer", "minimal_pipe", "function optimizer.minimal_pipe():void;", "", "" ] +[ "optimizer", "minimal_strimps_pipe", "function optimizer.minimal_strimps_pipe():void;", "", "" ] [ "optimizer", "minimalfast", "pattern optimizer.minimalfast():str ", "OPTwrapper;", "" ] [ "optimizer", "minimalfast", "pattern optimizer.minimalfast(X_0:str, X_1:str):str ", "OPTwrapper;", "Fast compound minimal optimizer pipe" ] [ "optimizer", "mitosis", "pattern optimizer.mitosis():str ", "OPTwrapper;", "" ] @@ -9156,6 +9157,9 @@ [ "optimizer", "reorder", "pattern optimizer.reorder():str ", "OPTwrapper;", "" ] [ "optimizer", "reorder", "pattern optimizer.reorder(X_0:str, X_1:str):str ", "OPTwrapper;", "Reorder by dataflow dependencies" ] [ "optimizer", "sequential_pipe", "function optimizer.sequential_pipe():void;", "", "" ] +[ "optimizer", "strimps", "pattern optimizer.strimps():str ", "OPTwrapper;", "" ] +[ "optimizer", "strimps", "pattern optimizer.strimps(X_0:str, X_1:str):str ", "OPTwrapper;", "Use strimps index if appropriate" ] +[ "optimizer", "strimps_pipe", "function optimizer.strimps_pipe():void;", "", "" ] [ "optimizer", "volcano", "pattern optimizer.volcano():str ", "OPTwrapper;", "" ] [ "optimizer", "volcano", "pattern 
optimizer.volcano(X_0:str, X_1:str):str ", "OPTwrapper;", "Simulate volcano style execution" ] [ "optimizer", "volcano_pipe", "function optimizer.volcano_pipe():void;", "", "" ] @@ -9305,6 +9309,7 @@ [ "sql", "covariancep", "pattern sql.covariancep(X_0:lng, X_1:lng, X_2:bit, X_3:bit, X_4:int, X_5:oid, X_6:oid):dbl ", "SQLcovar_pop;", "return the covariance population value of groups" ] [ "sql", "covariancep", "pattern sql.covariancep(X_0:sht, X_1:sht, X_2:bit, X_3:bit, X_4:int, X_5:oid, X_6:oid):dbl ", "SQLcovar_pop;", "return the covariance population value of groups" ] [ "sql", "createorderindex", "unsafe pattern sql.createorderindex(X_0:str, X_1:str, X_2:str):void ", "sql_createorderindex;","Instantiate the order index on a column" ] +[ "sql", "createstrimps","unsafe pattern sql.createstrimps(X_0:str, X_1:str, X_2:str):void ","sql_createstrimps;", "Instantiate the strimps index on a column" ] [ "sql", "cume_dist","pattern sql.cume_dist(X_0:any_1, X_1:bit, X_2:bit):dbl ", "SQLcume_dist;","return the accumulated distribution of the number of rows per group to the total number of partition rows" ] [ "sql", "current_time", "pattern sql.current_time():daytime ", "SQLcurrent_daytime;", "Get the clients current daytime" ] [ "sql", "current_timestamp","pattern sql.current_timestamp():timestamp ", "SQLcurrent_timestamp;","Get the clients current timestamp" ] @@ -9592,6 +9597,10 @@ [ "streams", "readStr", "unsafe command streams.readStr(X_0:streams):str ", "mnstr_read_stringwrap;", "read string data from the stream" ] [ "streams", "writeInt", "unsafe command streams.writeInt(X_0:streams, X_1:int):void ", "mnstr_writeIntwrap;", "write data on the stream" ] [ "streams", "writeStr", "unsafe command streams.writeStr(X_0:streams, X_1:str):void ", "mnstr_write_stringwrap;", "write data on the stream" ] +[ "strimps", "mkstrimp", "pattern strimps.mkstrimp(X_0:bat[:str], X_1:bat[:oid]):void ", "PATstrimpCreate;", "construct the strimp a BAT"] +[ "strimps", "strimpfilter", "pattern 
strimps.strimpfilter(X_0:str, X_1:str):bit ", "PATstrimpFilter;", "" ] +[ "strimps", "strimpfilterjoin", "pattern strimps.strimpfilterjoin(X_0:str, X_1:any, X_2:any, X_3:any, X_4:any, X_5:any) (X_6:bat[:oid], X_7:bat[:str]) ", "PATstrimpFilter;", "" ] +[ "strimps", "strimpfilterselect", "pattern strimps.strimpfilterselect(X_0:bat[:str], X_1:bat[:oid], X_2:str, X_3:bit):bat[:oid] ","PATstrimpFilterSelect;", "" ] [
MonetDB: string_imprints - Merge with default
Changeset: 3fc32e309cdc for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/3fc32e309cdc Modified Files: gdk/gdk_bbp.c Branch: string_imprints Log Message: Merge with default diffs (truncated from 2202 to 300 lines): diff --git a/CMakeLists.txt b/CMakeLists.txt --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,8 +53,7 @@ test_big_endian(IS_BIG_ENDIAN) include(monetdb-functions) include(monetdb-findpackages) include(monetdb-toolchain) -monetdb_default_toolchain() -#monetdb_default_compiler_options() +monetdb_default_compiler_options() include(monetdb-defines) monetdb_hg_revision() diff --git a/clients/mapiclient/dump.c b/clients/mapiclient/dump.c --- a/clients/mapiclient/dump.c +++ b/clients/mapiclient/dump.c @@ -382,15 +382,16 @@ dump_foreign_keys(Mapi mid, const char * if (tname != NULL) { char *s = sescape(schema); char *t = sescape(tname); + if (s == NULL || t == NULL) { + free(s); + free(t); + goto bailout; + } maxquerylen = 1024 + strlen(t) + strlen(s); query = malloc(maxquerylen); - if (s == NULL || t == NULL || query == NULL) { - if (s) - free(s); - if (t) - free(t); - if (query) - free(query); + if (query == NULL) { + free(s); + free(t); goto bailout; } snprintf(query, maxquerylen, diff --git a/clients/odbc/driver/ODBCUtil.c b/clients/odbc/driver/ODBCUtil.c --- a/clients/odbc/driver/ODBCUtil.c +++ b/clients/odbc/driver/ODBCUtil.c @@ -1255,6 +1255,7 @@ ODBCTranslateSQL(ODBCDbc *dbc, const SQL strncpy(q, nquery, pr); for (r = func->repl; *r; r++) { if (*r == '\1' || *r == '\2' || *r == '\3' || *r == '\4') { + assert(*r <= func->nargs); if (args[*r - 1].argstart[0] == '\'') q[pr++] = 'r'; strncpy(q + pr, args[*r - 1].argstart, args[*r - 1].arglen); diff --git a/clients/odbc/driver/SQLTables.c b/clients/odbc/driver/SQLTables.c --- a/clients/odbc/driver/SQLTables.c +++ b/clients/odbc/driver/SQLTables.c @@ -82,6 +82,8 @@ MNDBTables(ODBCStmt *stmt, "cast(null as varchar(1)) as remarks " "from sys.env() e " "where e.name = 'gdk_dbname'"); + if (query == NULL) + 
goto nomem; } else if (NameLength1 == 0 && NameLength3 == 0 && SchemaName && @@ -96,6 +98,8 @@ MNDBTables(ODBCStmt *stmt, * schema remarks */ "cast(null as varchar(1)) as remarks " "from sys.schemas order by table_schem"); + if (query == NULL) + goto nomem; } else if (NameLength1 == 0 && NameLength2 == 0 && NameLength3 == 0 && @@ -108,6 +112,8 @@ MNDBTables(ODBCStmt *stmt, "table_type_name as table_type, " "cast(null as varchar(1)) as remarks " "from sys.table_types order by table_type"); + if (query == NULL) + goto nomem; } else { /* no special case argument values */ size_t querylen; @@ -241,8 +247,6 @@ MNDBTables(ODBCStmt *stmt, free(sch); if (tab) free(tab); - if (query) - free(query); /* Memory allocation error */ addStmtError(stmt, "HY001", NULL, 0); return SQL_ERROR; diff --git a/cmake/monetdb-functions.cmake b/cmake/monetdb-functions.cmake --- a/cmake/monetdb-functions.cmake +++ b/cmake/monetdb-functions.cmake @@ -9,73 +9,60 @@ function(monetdb_hg_revision) # Get the current version control revision if(EXISTS "${CMAKE_SOURCE_DIR}/.hg_archival.txt") -execute_process(COMMAND "sed" "-n" "s/^node: \\([0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f][0-9a-f]\\).*/\\1/p" ".hg_archival.txt" WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}" RESULT_VARIABLE HG_RETURN_CODE - OUTPUT_VARIABLE HG_OUPUT_RES OUTPUT_STRIP_TRAILING_WHITESPACE) -if(HG_RETURN_CODE EQUAL 0 AND HG_OUPUT_RES) - set(MERCURIAL_ID
MonetDB: Jan2022 - Merge heads
Changeset: 10686cbf3739 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/10686cbf3739 Branch: Jan2022 Log Message: Merge heads diffs (truncated from 716 to 300 lines): diff --git a/MonetDB.spec b/MonetDB.spec --- a/MonetDB.spec +++ b/MonetDB.spec @@ -616,7 +616,11 @@ This package contains files needed to de %files SQL-server5-devel %defattr(-,root,root) +%{_includedir}/monetdb/exception_buffer.h +%{_includedir}/monetdb/opt_backend.h +%{_includedir}/monetdb/rel_*.h %{_includedir}/monetdb/sql*.h +%{_includedir}/monetdb/store_*.h %package embedded Summary: MonetDB as an embedded library diff --git a/NT/mksqlwxs.py b/NT/mksqlwxs.py --- a/NT/mksqlwxs.py +++ b/NT/mksqlwxs.py @@ -179,7 +179,7 @@ def main(): print(r'') print(r' ') id = comp(extend, id, 16, - sorted([r'include\monetdb\{}'.format(x) for x in filter(lambda x: (x.startswith('gdk') or x.startswith('monet') or x.startswith('mal') or x.startswith('sql')) and x.endswith('.h'), os.listdir(os.path.join(sys.argv[3], 'include', 'monetdb')))] + + sorted([r'include\monetdb\{}'.format(x) for x in filter(lambda x: (x.startswith('gdk') or x.startswith('monet') or x.startswith('mal') or x.startswith('sql') or x.startswith('rel') or x.startswith('store') or x.startswith('exception') or x.startswith('opt_backend')) and x.endswith('.h'), os.listdir(os.path.join(sys.argv[3], 'include', 'monetdb')))] + [r'include\monetdb\copybinary.h', r'include\monetdb\mapi.h', r'include\monetdb\matomic.h', diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out --- a/clients/Tests/exports.stable.out +++ b/clients/Tests/exports.stable.out @@ -1433,6 +1433,27 @@ const char *wlrRef; Workingset workingset[THREADS]; const char *zero_or_oneRef; +# monetdbe +char *monetdbe_append(monetdbe_database dbhdl, const char *schema, const char *table, monetdbe_column **input, size_t column_count); +char *monetdbe_bind(monetdbe_statement *stmt, void *data, size_t parameter_nr); +char *monetdbe_cleanup_result(monetdbe_database 
dbhdl, monetdbe_result *result); +char *monetdbe_cleanup_statement(monetdbe_database dbhdl, monetdbe_statement *stmt); +int monetdbe_close(monetdbe_database db); +char *monetdbe_dump_database(monetdbe_database dbhdl, const char *backupfile); +char *monetdbe_dump_table(monetdbe_database dbhdl, const char *schema_name, const char *table_name, const char *backupfile); +char *monetdbe_error(monetdbe_database db); +char *monetdbe_execute(monetdbe_statement *stmt, monetdbe_result **result, monetdbe_cnt *affected_rows); +char *monetdbe_get_autocommit(monetdbe_database dbhdl, int *result); +char *monetdbe_get_columns(monetdbe_database dbhdl, const char *schema_name, const char *table_name, size_t *column_count, monetdbe_column **columns); +int monetdbe_in_transaction(monetdbe_database dbhdl); +const void *monetdbe_null(monetdbe_database dbhdl, monetdbe_types t); +int monetdbe_open(monetdbe_database *db, char *url, monetdbe_options *opts); +char *monetdbe_prepare(monetdbe_database dbhdl, char *query, monetdbe_statement **stmt, monetdbe_result **result); +char *monetdbe_query(monetdbe_database dbhdl, char *query, monetdbe_result **result, monetdbe_cnt *affected_rows); +char *monetdbe_result_fetch(monetdbe_result *mres, monetdbe_column **res, size_t column_index); +char *monetdbe_set_autocommit(monetdbe_database dbhdl, int value); +const char *monetdbe_version(void); + # stream stream *block_stream(stream *s); stream *block_stream2(stream *s, size_t bufsiz, compression_method comp); diff --git a/debian/monetdb5-sql-dev.install b/debian/monetdb5-sql-dev.install --- a/debian/monetdb5-sql-dev.install +++ b/debian/monetdb5-sql-dev.install @@ -1,1 +1,5 @@ +debian/tmp/usr/include/monetdb/exception_buffer.h usr/include/monetdb +debian/tmp/usr/include/monetdb/opt_backend.h usr/include/monetdb +debian/tmp/usr/include/monetdb/rel_*.h usr/include/monetdb debian/tmp/usr/include/monetdb/sql*.h usr/include/monetdb +debian/tmp/usr/include/monetdb/store_*.h usr/include/monetdb diff --git 
a/sql/test/emptydb-previous-upgrade-chain-hge/Tests/upgrade.stable.out.ppc64.int128 b/sql/test/emptydb-previous-upgrade-chain-hge/Tests/upgrade.stable.out.ppc64.int128 --- a/sql/test/emptydb-previous-upgrade-chain-hge/Tests/upgrade.stable.out.ppc64.int128 +++ b/sql/test/emptydb-previous-upgrade-chain-hge/Tests/upgrade.stable.out.ppc64.int128 @@ -5200,6 +5200,7 @@ drop view sys.dump_create_roles; drop view sys.describe_functions; drop view sys.describe_partition_tables; drop view sys.describe_privileges; +drop view sys.fully_qualified_functions; drop view sys.describe_comments; drop view sys.describe_tables; drop view sys.describe_sequences; @@ -5280,6 +5281,28 @@ CREATE VIEW sys.describe_tables AS AND s.id = t.schema_id AND ts.table_type_id = t.type AND s.name <>
MonetDB: Jan2022 - Add changelog entry
Changeset: d5cf7d95ecf8 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/d5cf7d95ecf8 Modified Files: gdk/ChangeLog.Jan2022 Branch: Jan2022 Log Message: Add changelog entry diffs (22 lines): diff --git a/gdk/ChangeLog.Jan2022 b/gdk/ChangeLog.Jan2022 --- a/gdk/ChangeLog.Jan2022 +++ b/gdk/ChangeLog.Jan2022 @@ -1,6 +1,18 @@ # ChangeLog file for GDK # This file is updated with Maddlog +* Mon Jan 3 2022 Panagiotis Koutsourakis +- Implement string imprints (strimps for short) a pre-filter structure + for strings in order to accelerate LIKE queries. If a strimp exists + for a specific string column the strings are pre-filtered, rejecting + strings that cannot possibly match, before the more expensive and + accurate matching algorithms run. Strimps are created automatically + or using 'sys.strimp_create' with arguments the names of the schema, + table and column. Automatic strimp creation is controlled by two + user settable gdk options: 'gdk_use_strimps' (default value "no") and + 'gdk_strimps_threshold' (default value 1.000.000). See the manual for + more details. + * Wed Aug 11 2021 Sjoerd Mullender - Many (most) low level functions that could take a long time (such as BATjoin) can now be aborted with a timeout. When the function takes too ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: Jan2022 - Make strimps opt-in
Changeset: 4c84a9be8cc8 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/4c84a9be8cc8 Modified Files: monetdb5/modules/mal/pcre.c Branch: Jan2022 Log Message: Make strimps opt-in The user needs to specify --set gdk_use_strimps=yes in order to enable strimp creation. diffs (12 lines): diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c --- a/monetdb5/modules/mal/pcre.c +++ b/monetdb5/modules/mal/pcre.c @@ -1870,7 +1870,7 @@ PCRElikeselect(bat *ret, const bat *bid, str msg = MAL_SUCCEED; char *ppat = NULL; bool use_re = false, use_strcmp = false, empty = false; - bool use_strimps = !GDKgetenv_istext("gdk_use_strimps", "no"), with_strimps = false; + bool use_strimps = GDKgetenv_isyes("gdk_use_strimps"), with_strimps = false; if ((b = BATdescriptor(*bid)) == NULL) { msg = createException(MAL, "algebra.likeselect", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: Jan2022 - Make the strimp creation threshold user defined
Changeset: a5a8ed8f7f73 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/a5a8ed8f7f73 Modified Files: monetdb5/modules/mal/pcre.c Branch: Jan2022 Log Message: Make the strimp creation threshold user defined The parameter gdk_strimps_threshold specifies how many entries the string bat should have before a strimp is created. The default value is 1.000.000. diffs (20 lines): diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c --- a/monetdb5/modules/mal/pcre.c +++ b/monetdb5/modules/mal/pcre.c @@ -1871,6 +1871,7 @@ PCRElikeselect(bat *ret, const bat *bid, char *ppat = NULL; bool use_re = false, use_strcmp = false, empty = false; bool use_strimps = GDKgetenv_isyes("gdk_use_strimps"), with_strimps = false; + BUN strimp_creation_threshold = GDKgetenv_int("gdk_strimps_threshold", 100); if ((b = BATdescriptor(*bid)) == NULL) { msg = createException(MAL, "algebra.likeselect", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); @@ -1893,7 +1894,7 @@ PCRElikeselect(bat *ret, const bat *bid, * A better solution is to run the PCRElikeselect as a LIKE query with * strimps and return the complement of the result. */ - if (!empty && use_strimps && BATcount(b) >= STRIMP_CREATION_THRESHOLD && !*anti) { + if (!empty && use_strimps && BATcount(b) >= strimp_creation_threshold && !*anti) { BAT *tmp_s = NULL; if (STRMPcreate(b, NULL) == GDK_SUCCEED && (tmp_s = STRMPfilter(b, s, *pat))) { if (s) ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Add function to append bitstring to a...
Changeset: 78f1dd084b9a for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/78f1dd084b9a Modified Files: gdk/gdk_strimps.c gdk/gdk_strimps.h Branch: string_imprints Log Message: Add function to append bitstring to a strimp diffs (69 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -716,6 +716,7 @@ STRMPcreate(BAT *b, BAT *s) { } bat_iterator_end(); + r->strimps.free += ncand*sizeof(uint64_t); pb->tstrimps = r; pb->batDirtydesc = true; persistStrimp(pb); @@ -726,6 +727,48 @@ STRMPcreate(BAT *b, BAT *s) { return GDK_SUCCEED; } +gdk_return +STRMPappendBitstring(BAT *b, const str s) { + lng t0 = 0; + BAT *pb; + + TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec(); + if (ATOMstorage(b->ttype) != TYPE_str) { + GDKerror("Cannot manipulate strimps index for non string bats\n"); + return GDK_FAIL; + } + + if (VIEWtparent(b)) { + pb = BBP_cache(VIEWtparent(b)); + assert(pb); + } else { + pb = b; + } + + if (!BATcheckstrimps(pb)) { + GDKerror("Strimp missing, cannot append value\n"); + return GDK_FAIL; + } + MT_lock_set(>batIdxLock); + // Check that there is space in the heap + if (pb->tstrimps->strimps.free < pb->tstrimps->strimps.size + sizeof(uint64_t)) { + pb->tstrimps->strimps.base[pb->tstrimps->strimps.free] = STRMPmakebitstring(s, pb->tstrimps); + pb->tstrimps->strimps.free += sizeof(uint64_t); + } + else { + // TODO reallocate buffer + } + + // TODO increase reconstruction counter if + // reconstruction counter is larger than a threshold + // recompute the strimp from scratch. + + MT_lock_unset(>batIdxLock); + + TRC_DEBUG(ACCELERATOR, "appending to strimp took " LLFMT " usec\n", GDKusec()-t0); + return GDK_SUCCEED; +} + /* Parallel creation. does not wok*/ #if 0 /* Creates the heap for a string imprint. Returns NULL on failure. 
This diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h --- a/gdk/gdk_strimps.h +++ b/gdk/gdk_strimps.h @@ -45,4 +45,5 @@ typedef struct { // gdk_export gdk_return STRMP_make_header(BAT *b); gdk_export gdk_return STRMPcreate(BAT *b, BAT *s); gdk_export BAT *STRMPfilter(BAT *b, BAT *s, const str q); +gdk_export gdk_return STRMPappendBitstring(BAT *b, const str s); #endif /* _GDK_STRIMPS_H_ */ ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Merge heads
Changeset: 2f528186e330 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/2f528186e330 Branch: string_imprints Log Message: Merge heads diffs (294 lines): diff --git a/clients/Tests/MAL-signatures.stable.out b/clients/Tests/MAL-signatures.stable.out --- a/clients/Tests/MAL-signatures.stable.out +++ b/clients/Tests/MAL-signatures.stable.out @@ -9132,7 +9132,6 @@ [ "optimizer", "mergetable", "pattern optimizer.mergetable(X_0:str, X_1:str):str ", "OPTwrapper;", "Resolve the multi-table definitions" ] [ "optimizer", "minimal_fast", "function optimizer.minimal_fast():void;", "", "" ] [ "optimizer", "minimal_pipe", "function optimizer.minimal_pipe():void;", "", "" ] -[ "optimizer", "minimal_strimps_pipe", "function optimizer.minimal_strimps_pipe():void;", "", "" ] [ "optimizer", "minimalfast", "pattern optimizer.minimalfast():str ", "OPTwrapper;", "" ] [ "optimizer", "minimalfast", "pattern optimizer.minimalfast(X_0:str, X_1:str):str ", "OPTwrapper;", "Fast compound minimal optimizer pipe" ] [ "optimizer", "mitosis", "pattern optimizer.mitosis():str ", "OPTwrapper;", "" ] @@ -9168,7 +9167,6 @@ [ "optimizer", "sequential_pipe", "function optimizer.sequential_pipe():void;", "", "" ] [ "optimizer", "strimps", "pattern optimizer.strimps():str ", "OPTwrapper;", "" ] [ "optimizer", "strimps", "pattern optimizer.strimps(X_0:str, X_1:str):str ", "OPTwrapper;", "Use strimps index if appropriate" ] -[ "optimizer", "strimps_pipe", "function optimizer.strimps_pipe():void;", "", "" ] [ "optimizer", "volcano", "pattern optimizer.volcano():str ", "OPTwrapper;", "" ] [ "optimizer", "volcano", "pattern optimizer.volcano(X_0:str, X_1:str):str ", "OPTwrapper;", "Simulate volcano style execution" ] [ "optimizer", "volcano_pipe", "function optimizer.volcano_pipe():void;", "", "" ] @@ -9609,7 +9607,7 @@ [ "streams", "readStr", "unsafe command streams.readStr(X_0:streams):str ", "mnstr_read_stringwrap;", "read string data from the stream" ] [ "streams", "writeInt", "unsafe command 
streams.writeInt(X_0:streams, X_1:int):void ", "mnstr_writeIntwrap;", "write data on the stream" ] [ "streams", "writeStr", "unsafe command streams.writeStr(X_0:streams, X_1:str):void ", "mnstr_write_stringwrap;", "write data on the stream" ] -[ "strimps", "mkstrimp", "pattern strimps.mkstrimp(X_0:bat[:str], X_1:bat[:oid]):void ", "PATstrimpCreate;", "construct the strimp a BAT"] +[ "strimps", "mkstrimp", "unsafe pattern strimps.mkstrimp(X_0:bat[:str], X_1:bat[:oid]):void ", "PATstrimpCreate;", "construct the strimp a BAT" ] [ "strimps", "strimpfilter", "pattern strimps.strimpfilter(X_0:str, X_1:str):bit ", "PATstrimpFilter;", "" ] [ "strimps", "strimpfilterjoin", "pattern strimps.strimpfilterjoin(X_0:str, X_1:any, X_2:any, X_3:any, X_4:any, X_5:any) (X_6:bat[:oid], X_7:bat[:str]) ", "PATstrimpFilter;", "" ] [ "strimps", "strimpfilterselect", "pattern strimps.strimpfilterselect(X_0:bat[:str], X_1:bat[:oid], X_2:str, X_3:bit):bat[:oid] ","PATstrimpFilterSelect;", "" ] diff --git a/clients/Tests/MAL-signatures.stable.out.int128 b/clients/Tests/MAL-signatures.stable.out.int128 --- a/clients/Tests/MAL-signatures.stable.out.int128 +++ b/clients/Tests/MAL-signatures.stable.out.int128 @@ -12928,7 +12928,7 @@ [ "streams", "readStr", "unsafe command streams.readStr(X_0:streams):str ", "mnstr_read_stringwrap;", "read string data from the stream" ] [ "streams", "writeInt", "unsafe command streams.writeInt(X_0:streams, X_1:int):void ", "mnstr_writeIntwrap;", "write data on the stream" ] [ "streams", "writeStr", "unsafe command streams.writeStr(X_0:streams, X_1:str):void ", "mnstr_write_stringwrap;", "write data on the stream" ] -[ "strimps", "mkstrimp", "pattern strimps.mkstrimp(X_0:bat[:str], X_1:bat[:oid]):void ", "PATstrimpCreate;", "construct the strimp a BAT"] +[ "strimps", "mkstrimp", "unsafe pattern strimps.mkstrimp(X_0:bat[:str], X_1:bat[:oid]):void ", "PATstrimpCreate;", "construct the strimp a BAT" ] [ "strimps", "strimpfilter", "pattern strimps.strimpfilter(X_0:str, 
X_1:str):bit ", "PATstrimpFilter;", "" ] [ "strimps", "strimpfilterjoin", "pattern strimps.strimpfilterjoin(X_0:str, X_1:any, X_2:any, X_3:any, X_4:any, X_5:any) (X_6:bat[:oid], X_7:bat[:str]) ", "PATstrimpFilter;", "" ] [ "strimps", "strimpfilterselect", "pattern strimps.strimpfilterselect(X_0:bat[:str], X_1:bat[:oid], X_2:str, X_3:bit):bat[:oid] ","PATstrimpFilterSelect;", "" ] diff --git a/sql/test/emptydb/Tests/check.stable.out b/sql/test/emptydb/Tests/check.stable.out ---
MonetDB: string_imprints - Add tests specific to strimps
Changeset: 34589d226cb2 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/34589d226cb2 Added Files: sql/test/strimps/Tests/All sql/test/strimps/Tests/persisted_strimp.SQL.py sql/test/strimps/Tests/strimps_stable_counts.SQL.py Branch: string_imprints Log Message: Add tests specific to strimps diffs (129 lines): diff --git a/sql/test/strimps/Tests/All b/sql/test/strimps/Tests/All new file mode 100644 --- /dev/null +++ b/sql/test/strimps/Tests/All @@ -0,0 +1,2 @@ +strimps_stable_counts +persisted_strimp diff --git a/sql/test/strimps/Tests/persisted_strimp.SQL.py b/sql/test/strimps/Tests/persisted_strimp.SQL.py new file mode 100644 --- /dev/null +++ b/sql/test/strimps/Tests/persisted_strimp.SQL.py @@ -0,0 +1,57 @@ +import os +import socket +import tempfile + +try: +from MonetDBtesting import process +except ImportError: +import process +from MonetDBtesting.sqltest import SQLTestCase + +COUNT_QUERY = "SELECT COUNT(*) FROM orders WHERE o_comment LIKE '%%slyly%%';" + +sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +sock.bind(('', 0)) +port = sock.getsockname()[1] +sock.close() + +# Make sure that reading a persisted strimp from disk gives correct +# results. 
+ +with tempfile.TemporaryDirectory() as farm_dir: +fdir = os.path.join(farm_dir, 'db1') +os.mkdir(fdir) +with process.server(mapiport=port, dbname='db1', +args=["--set", "gdk_use_strimps=yes",], +dbfarm=fdir, +stdin=process.PIPE, +stdout=process.PIPE, +stderr=process.PIPE) as s: +with SQLTestCase() as mdb: +mdb.connect(database='db1', port=port, username='monetdb', password='monetdb') +mdb.execute("""CREATE TABLE orders ( + o_orderkey BIGINT NOT NULL, + o_custkeyINTEGER NOT NULL, + o_orderstatusCHAR(1) NOT NULL, + o_totalprice DECIMAL(15,2) NOT NULL, + o_orderdate DATE NOT NULL, + o_orderpriority CHAR(15) NOT NULL, + o_clerk CHAR(15) NOT NULL, + o_shippriority INTEGER NOT NULL, + o_commentVARCHAR(79) NOT NULL);""").assertSucceeded() +mdb.execute("""COPY 15000 RECORDS INTO orders from r'{}/sql/benchmarks/tpch/SF-0.01/orders.tbl' USING DELIMITERS '|','\n','"';""".format(os.getenv('TSTSRCBASE'))).assertSucceeded() +mdb.execute("""COPY 15000 RECORDS INTO orders from r'{}/sql/benchmarks/tpch/SF-0.01/orders.tbl' USING DELIMITERS '|','\n','"';""".format(os.getenv('TSTSRCBASE'))).assertSucceeded() +mdb.execute("""COPY 15000 RECORDS INTO orders from r'{}/sql/benchmarks/tpch/SF-0.01/orders.tbl' USING DELIMITERS '|','\n','"';""".format(os.getenv('TSTSRCBASE'))).assertSucceeded() +mdb.execute("""COPY 15000 RECORDS INTO orders from r'{}/sql/benchmarks/tpch/SF-0.01/orders.tbl' USING DELIMITERS '|','\n','"';""".format(os.getenv('TSTSRCBASE'))).assertSucceeded() +mdb.execute("SELECT COUNT(*) FROM orders WHERE o_comment LIKE '%%slyly%%';").assertSucceeded().assertDataResultMatch([(12896,)]) +s.communicate() + +with process.server(mapiport=port, dbname='db1', +args=["--set", "gdk_use_strimps=yes",], +dbfarm=fdir, +stdin=process.PIPE, stdout=process.PIPE, stderr=process.PIPE) as s: +with SQLTestCase() as mdb: +mdb.connect(database='db1', port=port, username='monetdb', password='monetdb') +mdb.execute("SELECT COUNT(*) FROM orders WHERE o_comment LIKE 
'%%slyly%%';").assertSucceeded().assertDataResultMatch([(12896,)]) +mdb.execute("SELECT COUNT(*) FROM orders WHERE o_comment LIKE '%%slyly%%';").assertSucceeded().assertDataResultMatch([(12896,)]) +s.communicate() diff --git a/sql/test/strimps/Tests/strimps_stable_counts.SQL.py b/sql/test/strimps/Tests/strimps_stable_counts.SQL.py new file mode 100644 --- /dev/null +++ b/sql/test/strimps/Tests/strimps_stable_counts.SQL.py @@ -0,0 +1,55 @@ +import os +import socket +import tempfile + +try: +from MonetDBtesting import process +except ImportError: +import process +from MonetDBtesting.sqltest import SQLTestCase + +COUNT_QUERY = "SELECT COUNT(*) FROM orders WHERE o_comment LIKE '%%slyly%%';" + +sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +sock.bind(('', 0)) +port = sock.getsockname()[1] +sock.close() + +# Make sure that using a strimp returns the same number of rows as +# not using it. + +with tempfile.TemporaryDirectory() as farm_dir: +fdir = os.path.join(farm_dir, 'db1') +os.mkdir(fdir) +with process.server(mapiport=port, dbname='db1', +dbfarm=fdir, +
MonetDB: string_imprints - Automated merge with ssh://dev.monetd...
Changeset: caf72b9fc8d4 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/caf72b9fc8d4 Branch: string_imprints Log Message: Automated merge with ssh://dev.monetdb.org/MonetDB diffs (192 lines): diff --git a/gdk/gdk_align.c b/gdk/gdk_align.c --- a/gdk/gdk_align.c +++ b/gdk/gdk_align.c @@ -378,7 +378,6 @@ VIEWdestroy(BAT *b) IMPSdestroy(b); OIDXdestroy(b); PROPdestroy(b); - STRMPdestroy(b); VIEWunlink(b); MT_lock_set(>theaplock); diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h --- a/gdk/gdk_private.h +++ b/gdk/gdk_private.h @@ -229,6 +229,8 @@ void IMPSincref(Imprints *imprints) void IMPSprint(BAT *b) /* never called: for debugging only */ __attribute__((__cold__)); #endif +void STRMPincref(Strimps *strimps) + __attribute__((__visibility__("hidden"))); void STRMPdecref(Strimps *strimps, bool remove) __attribute__((__visibility__("hidden"))); void STRMPdestroy(BAT *b) diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -447,6 +447,8 @@ BATcheckstrimps(BAT *b) hp->bitstrings_base = hp->strimps.base + hsize;/* bitmasks just after the pairs */ close(fd); + ATOMIC_INIT(>strimps.refs, 1); + // STRMPincref(hp); hp->strimps.parentid = b->batCacheid; b->tstrimps = hp; TRC_DEBUG(ACCELERATOR, "BATcheckstrimps(" ALGOBATFMT "): reusing persisted strimp\n", ALGOBATPAR(b)); @@ -499,12 +501,16 @@ STRMPfilter(BAT *b, BAT *s, const str q) BAT *pb = BBP_cache(VIEWtparent(b)); if (!BATcheckstrimps(pb)) goto sfilter_fail; + MT_lock_set(>batIdxLock); strmps = pb->tstrimps; + MT_lock_unset(>batIdxLock); } else { if (!BATcheckstrimps(b)) goto sfilter_fail; + MT_lock_set(>batIdxLock); strmps = b->tstrimps; + MT_lock_unset(>batIdxLock); } ncand = canditer_init(, b, s); @@ -666,7 +672,7 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s) r->bitstrings_base = h2; r->strimps.free = sz; r->rec_cnt = 0; - + ATOMIC_INIT(>strimps.refs, 1); } return r; } @@ -677,6 +683,7 @@ STRMPcreate(BAT *b, BAT *s) lng t0 = 0; BAT *pb; + 
MT_thread_setalgorithm("create strimp index"); TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec(); if (ATOMstorage(b->ttype) != TYPE_str) { GDKerror("Cannot create strimps index for non string bats\n"); @@ -709,7 +716,6 @@ STRMPcreate(BAT *b, BAT *s) MT_lock_unset(>batIdxLock); return GDK_FAIL; } - HEAPincref(>strimps); dh = (uint64_t *)r->bitstrings_base; /* Compute bitstrings */ @@ -789,20 +795,82 @@ STRMPappendBitstring(BAT *b, const str s } void +STRMPbatdecref(BAT *b, bool remove) +{ + Strimps *strimps; + BAT *pb = NULL; + + if (VIEWtparent(b)) { + pb = BBP_cache(VIEWtparent(b)); + assert(pb); + } else { + pb = b; + } + + MT_lock_set(>batIdxLock); + if (pb && pb->tstrimps && pb->tstrimps != (Strimps *)1) { + strimps = pb->tstrimps; + } else { + MT_lock_unset(>batIdxLock); + return; + } + STRMPdecref(strimps, remove); + MT_lock_unset(>batIdxLock); +} + +void +STRMPbatincref(BAT *b) +{ + Strimps *strimps; + BAT *pb = NULL; + + if (VIEWtparent(b)) { + pb = BBP_cache(VIEWtparent(b)); + assert(pb); + } else { + pb = b; + } + + MT_lock_set(>batIdxLock); + if (pb && pb->tstrimps && pb->tstrimps != (Strimps *)1) { + strimps = pb->tstrimps; + } else { + MT_lock_unset(>batIdxLock); + return; + } + STRMPincref(strimps); + MT_lock_unset(>batIdxLock); + +} + +void STRMPdecref(Strimps *strimps, bool remove) { + TRC_DEBUG(ACCELERATOR, "Decrement ref count of %s to " ULLFMT "\n", + strimps->strimps.filename, ATOMIC_GET(>strimps.refs) - 1); strimps->strimps.remove |= remove; if (ATOMIC_DEC(>strimps.refs) == 0) { ATOMIC_DESTROY(>strimps.refs); HEAPfree(>strimps, strimps->strimps.remove); GDKfree(strimps); } + +} + +void +STRMPincref(Strimps *strimps) +{ +
MonetDB: string_imprints - Take a lock before getting the strimp
Changeset: f7d7df6b897a for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/f7d7df6b897a Modified Files: gdk/gdk_align.c gdk/gdk_strimps.c Branch: string_imprints Log Message: Take a lock before getting the strimp This needs some more thought diffs (49 lines): diff --git a/gdk/gdk_align.c b/gdk/gdk_align.c --- a/gdk/gdk_align.c +++ b/gdk/gdk_align.c @@ -378,7 +378,6 @@ VIEWdestroy(BAT *b) IMPSdestroy(b); OIDXdestroy(b); PROPdestroy(b); - STRMPdestroy(b); VIEWunlink(b); MT_lock_set(>theaplock); diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -501,12 +501,16 @@ STRMPfilter(BAT *b, BAT *s, const str q) BAT *pb = BBP_cache(VIEWtparent(b)); if (!BATcheckstrimps(pb)) goto sfilter_fail; + MT_lock_set(>batIdxLock); strmps = pb->tstrimps; + MT_lock_unset(>batIdxLock); } else { if (!BATcheckstrimps(b)) goto sfilter_fail; + MT_lock_set(>batIdxLock); strmps = b->tstrimps; + MT_lock_unset(>batIdxLock); } ncand = canditer_init(, b, s); @@ -793,12 +797,17 @@ STRMPappendBitstring(BAT *b, const str s void STRMPdecref(Strimps *strimps, bool remove) { + TRC_DEBUG(ACCELERATOR, "Decrement ref count of %s to " ULLFMT "\n", + strimps->strimps.filename, ATOMIC_GET(>strimps.refs) - 1); strimps->strimps.remove |= remove; if (ATOMIC_DEC(>strimps.refs) == 0) { ATOMIC_DESTROY(>strimps.refs); HEAPfree(>strimps, strimps->strimps.remove); GDKfree(strimps); } + +} + void STRMPincref(Strimps *strimps) { ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Approve test
Changeset: 838e562ffb2a for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/838e562ffb2a Modified Files: clients/Tests/exports.stable.out Branch: string_imprints Log Message: Approve test diffs (12 lines): diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out --- a/clients/Tests/exports.stable.out +++ b/clients/Tests/exports.stable.out @@ -394,6 +394,8 @@ BUN SORTfnd(BAT *b, const void *v); BUN SORTfndfirst(BAT *b, const void *v); BUN SORTfndlast(BAT *b, const void *v); gdk_return STRMPappendBitstring(BAT *b, const str s); +void STRMPbatdecref(BAT *, bool); +void STRMPbatincref(BAT *); gdk_return STRMPcreate(BAT *b, BAT *s); BAT *STRMPfilter(BAT *b, BAT *s, const str q); MT_Id THRcreate(void (*f)(void *), void *arg, enum MT_thr_detach d, const char *name); ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Initialize heap ref counts on constr...
Changeset: 53920a0f4f9a for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/53920a0f4f9a Modified Files: gdk/gdk_strimps.c Branch: string_imprints Log Message: Initialize heap ref counts on construction diffs (53 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -447,6 +447,8 @@ BATcheckstrimps(BAT *b) hp->bitstrings_base = hp->strimps.base + hsize;/* bitmasks just after the pairs */ close(fd); + ATOMIC_INIT(>strimps.refs, 1); + // STRMPincref(hp); hp->strimps.parentid = b->batCacheid; b->tstrimps = hp; TRC_DEBUG(ACCELERATOR, "BATcheckstrimps(" ALGOBATFMT "): reusing persisted strimp\n", ALGOBATPAR(b)); @@ -666,7 +668,7 @@ STRMPcreateStrimpHeap(BAT *b, BAT *s) r->bitstrings_base = h2; r->strimps.free = sz; r->rec_cnt = 0; - + ATOMIC_INIT(>strimps.refs, 1); } return r; } @@ -677,6 +679,7 @@ STRMPcreate(BAT *b, BAT *s) lng t0 = 0; BAT *pb; + MT_thread_setalgorithm("create strimp index"); TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec(); if (ATOMstorage(b->ttype) != TYPE_str) { GDKerror("Cannot create strimps index for non string bats\n"); @@ -709,7 +712,6 @@ STRMPcreate(BAT *b, BAT *s) MT_lock_unset(>batIdxLock); return GDK_FAIL; } - HEAPincref(>strimps); dh = (uint64_t *)r->bitstrings_base; /* Compute bitstrings */ @@ -810,6 +812,7 @@ void STRMPdestroy(BAT *b) { if (b && b->tstrimps) { + TRC_DEBUG(ACCELERATOR, "Destroying strimp %s\n", b->tstrimps->strimps.filename); MT_lock_set(>batIdxLock); if (b->tstrimps == (Strimps *)1) { b->tstrimps = NULL; @@ -829,6 +832,7 @@ void STRMPfree(BAT *b) { if (b && b->tstrimps) { + TRC_DEBUG(ACCELERATOR, "Freeing strimp for BAT %s\n", b->tstrimps->strimps.filename); Strimps *s; MT_lock_set(>batIdxLock); if ((s = b->tstrimps) != NULL && s != (Strimps *)1) { ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Add STRMPincref gdk private function
Changeset: 3f012d974065 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/3f012d974065 Modified Files: gdk/gdk_private.h gdk/gdk_strimps.c Branch: string_imprints Log Message: Add STRMPincref gdk private function diffs (29 lines): diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h --- a/gdk/gdk_private.h +++ b/gdk/gdk_private.h @@ -229,6 +229,8 @@ void IMPSincref(Imprints *imprints) void IMPSprint(BAT *b) /* never called: for debugging only */ __attribute__((__cold__)); #endif +void STRMPincref(Strimps *strimps) + __attribute__((__visibility__("hidden"))); void STRMPdecref(Strimps *strimps, bool remove) __attribute__((__visibility__("hidden"))); void STRMPdestroy(BAT *b) diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -797,6 +797,13 @@ STRMPdecref(Strimps *strimps, bool remov HEAPfree(>strimps, strimps->strimps.remove); GDKfree(strimps); } +void +STRMPincref(Strimps *strimps) +{ + TRC_DEBUG(ACCELERATOR, "Increment ref count of %s to " ULLFMT "\n", + strimps->strimps.filename, ATOMIC_GET(>strimps.refs) + 1); + (void)ATOMIC_INC(>strimps.refs); + } void ___ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list
MonetDB: string_imprints - Merge with default
Changeset: 99f9be40d724 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/99f9be40d724 Modified Files: monetdb5/modules/mal/batExtensions.c monetdb5/modules/mal/pcre.c sql/backends/monet5/sql.c sql/test/emptydb/Tests/check.stable.out sql/test/emptydb/Tests/check.stable.out.int128 Branch: string_imprints Log Message: Merge with default diffs (truncated from 1709 to 300 lines): diff --git a/clients/mapiclient/dump.c b/clients/mapiclient/dump.c --- a/clients/mapiclient/dump.c +++ b/clients/mapiclient/dump.c @@ -2391,7 +2391,7 @@ dump_database(Mapi mid, stream *toConsol const char *sequences2 = "SELECT s.name, " "seq.name, " -"get_value_for(s.name, seq.name), " +"peak_next_value_for(s.name, seq.name), " "seq.\"minvalue\", " "seq.\"maxvalue\", " "seq.\"increment\", " diff --git a/common/utils/matomic.h b/common/utils/matomic.h --- a/common/utils/matomic.h +++ b/common/utils/matomic.h @@ -57,7 +57,6 @@ /* #define NO_ATOMIC_INSTRUCTIONS */ /* the atomic type we export is always a 64 bit unsigned integer */ -typedef uint64_t ATOMIC_BASE_TYPE; /* ignore __STDC_NO_ATOMICS__ if compiling using Intel compiler on * Windows since otherwise we can't compile this at all in C99 mode */ @@ -67,8 +66,10 @@ typedef uint64_t ATOMIC_BASE_TYPE; #if SIZEOF_LONG_LONG == 8 typedef volatile atomic_ullong ATOMIC_TYPE; +typedef unsigned long long ATOMIC_BASE_TYPE; #elif SIZEOF_LONG == 8 typedef volatile atomic_ulong ATOMIC_TYPE; +typedef unsigned long ATOMIC_BASE_TYPE; #else #error "we need a 64 bit atomic type" #endif @@ -106,6 +107,8 @@ typedef volatile atomic_flag ATOMIC_FLAG #elif defined(_MSC_VER) && !defined(NO_ATOMIC_INSTRUCTIONS) +typedef uint64_t ATOMIC_BASE_TYPE; + #include /* On Windows, with Visual Studio 2005, the compiler uses acquire @@ -227,6 +230,7 @@ typedef volatile int ATOMIC_FLAG; /* the new way of doing this according to GCC (the old way, using * __sync_* primitives is not supported) */ +typedef uint64_t ATOMIC_BASE_TYPE; typedef volatile ATOMIC_BASE_TYPE 
ATOMIC_TYPE; #define ATOMIC_VAR_INIT(val) (val) @@ -262,6 +266,8 @@ typedef volatile char ATOMIC_FLAG; /* emulate using mutexes */ +typedef uint64_t ATOMIC_BASE_TYPE; + #include /* required for pthread_mutex_t */ typedef struct { diff --git a/monetdb5/mal/mal_client.c b/monetdb5/mal/mal_client.c --- a/monetdb5/mal/mal_client.c +++ b/monetdb5/mal/mal_client.c @@ -598,7 +598,7 @@ MCreadClient(Client c) in->pos++; if (in->pos >= in->len || in->mode) { - ssize_t rd, sum = 0; + ssize_t rd; if (in->eof || !isa_block_stream(c->fdout)) { if (!isa_block_stream(c->fdout) && c->promptlength > 0) @@ -607,7 +607,6 @@ MCreadClient(Client c) in->eof = false; } while ((rd = bstream_next(in)) > 0 && !in->eof) { - sum += rd; if (!in->mode) /* read one line at a time in line mode */ break; } diff --git a/monetdb5/modules/mal/batExtensions.c b/monetdb5/modules/mal/batExtensions.c --- a/monetdb5/modules/mal/batExtensions.c +++ b/monetdb5/modules/mal/batExtensions.c @@ -100,7 +100,7 @@ CMDBATsingle(Client cntxt, MalBlkPtr mb, if( b == 0) throw(MAL,"bat.single", SQLSTATE(HY013) MAL_MALLOC_FAIL); if (ATOMextern(b->ttype)) - u = (ptr) *(str *)u; + u = (ptr) *(ptr *)u; if (BUNappend(b, u, false) != GDK_SUCCEED) { BBPreclaim(b); throw(MAL, "bat.single", SQLSTATE(HY013) MAL_MALLOC_FAIL); diff --git a/monetdb5/modules/mal/mal_io.c b/monetdb5/modules/mal/mal_io.c --- a/monetdb5/modules/mal/mal_io.c +++ b/monetdb5/modules/mal/mal_io.c @@ -120,8 +120,8 @@ IOprintBoth(Client cntxt, MalBlkPtr mb, if (hd) mnstr_printf(fp, "%s", hd); - if (ATOMvarsized(tpe)) - ATOMprint(tpe, *(str *) val, fp); + if (ATOMextern(tpe)) + ATOMprint(tpe, *(ptr *) val, fp); else ATOMprint(tpe, val, fp); diff --git a/monetdb5/modules/mal/pcre.c b/monetdb5/modules/mal/pcre.c --- a/monetdb5/modules/mal/pcre.c +++ b/monetdb5/modules/mal/pcre.c @@ -285,8 +285,6 @@ mywstrcasestr(const char *restrict hayst if (nlen == 0) return atend ? 
haystack + strlen(haystack) : haystack; - size_t hlen = strlen(haystack); - while (*haystack) { size_t i; size_t h; @@ -312,7 +310,6 @@ mywstrcasestr(const char *restrict hayst if (i == nlen && (!atend || haystack[h] == 0)) return haystack; haystack += step; - hlen -= step;