Changeset: fabfd34343c3 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=fabfd34343c3 Added Files: gdk/gdk_strimps.c gdk/gdk_strimps.h Modified Files: gdk/CMakeLists.txt gdk/gdk.h monetdb5/modules/mal/01_calc.mal monetdb5/modules/mal/batcalc.c Branch: string_imprints Log Message:
Count the total number of digrams in a string bat diffs (190 lines): diff --git a/gdk/CMakeLists.txt b/gdk/CMakeLists.txt --- a/gdk/CMakeLists.txt +++ b/gdk/CMakeLists.txt @@ -78,6 +78,7 @@ target_sources(bat gdk_analytic_func.c gdk_analytic.h gdk_tracer.c gdk_tracer.h + gdk_strimps.c gdk_strimps.h PUBLIC ${gdk_public_headers}) diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -2113,4 +2113,9 @@ gdk_export BAT *BATsample_with_seed(BAT */ #define MAXPARAMS 32 +/* + * String Imprints Development/Testing. TODO: remove the following. + */ + +#include "gdk_strimps.h" #endif /* _GDK_H_ */ diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c new file mode 100644 --- /dev/null +++ b/gdk/gdk_strimps.c @@ -0,0 +1,55 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V. + */ + +#include "monetdb_config.h" +#include "gdk.h" +#include "gdk_private.h" + +/* This counts how many unicode codepoints the given string + * contains. + */ +static size_t +GDKstrimp_strlen(const char *s) +{ + return strlen(s); +} + +/* Given a BAT return the number of digrams in it. The observation is + * that the number of digrams is the number of characters - 1: + * + * 1 digram starting at character 1 + * 1 digram starting at character 2 + * [...] + * 1 digram starting at character n - 1 + */ +gdk_return +GDKstrimp_ndigrams(BAT *b, size_t *n) +{ + // lng t0; + BUN i; + BATiter bi; + char *s; + // GDKtracer_set_component_level("ALGO", "DEBUG"); + // struct canditer ci; + + // t0 = GDKusec(); + // BATcheck(b, NULL); + assert(b->ttype == TYPE_str); + + bi = bat_iterator(b); + *n = 0; + for (i = 0; i < b->batCount; i++) { + s = (char *)BUNtail(bi, i); + *n += GDKstrimp_strlen(s) - 1; + // TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, (char *)BUNtail(bi, i)); + } + + // TRC_DEBUG(ALGO, LLFMT "usec\n", GDKusec() - t0); + + return GDK_SUCCEED; +} diff --git a/gdk/gdk_strimps.h b/gdk/gdk_strimps.h new file mode 100644 --- /dev/null +++ b/gdk/gdk_strimps.h @@ -0,0 +1,27 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright 1997 - July 2008 CWI, August 2008 - 2021 MonetDB B.V. + */ + +#ifndef _GDK_STRIMPS_H_ +#define _GDK_STRIMPS_H_ + +#include <stdint.h> + +#define HISTSIZE 64 + +typedef struct { + uint64_t counts[HISTSIZE]; + char foo; +} Histogram; + +typedef struct { + Histogram* hist; +} Strimp; + +gdk_export gdk_return GDKstrimp_ndigrams(BAT *b, size_t *n); + +#endif /* _GDK_STRIMPS_H_ */ diff --git a/monetdb5/modules/mal/01_calc.mal b/monetdb5/modules/mal/01_calc.mal --- a/monetdb5/modules/mal/01_calc.mal +++ b/monetdb5/modules/mal/01_calc.mal @@ -5593,3 +5593,9 @@ comment "Calculate aggregate string conc pattern str_group_concat(b:bat[:str],sep:bat[:str],s:bat[:oid],nil_if_empty:bit) :str address CMDBATstr_group_concat comment "Calculate aggregate string concatenate of B with candidate list and separator SEP."; + + +# foo +pattern str_iterate_bat(b:bat[:str]) :void; +address CMDstr_iterate_bat +comment "iterate through a bat"; diff --git a/monetdb5/modules/mal/batcalc.c b/monetdb5/modules/mal/batcalc.c --- a/monetdb5/modules/mal/batcalc.c +++ b/monetdb5/modules/mal/batcalc.c @@ -1368,6 +1368,39 @@ CMDifthen(Client cntxt, MalBlkPtr mb, Ma return MAL_SUCCEED; } + +/* + * String imprints dev/testing. TODO: remove. + */ +static str +CMDstrimp_ndigrams(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) +{ + bat bid; + BAT *b; + size_t n; + + (void)cntxt; + (void)mb; + + // return mythrow(MAL, "batcalc.striter", OPERATION_FAILED); + bid = *getArgReference_bat(stk, pci, 1); + if ((b = BATdescriptor(bid)) == NULL) + throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) RUNTIME_OBJECT_MISSING); + + if (!GDKstrimp_ndigrams(b, &n)) { + throw(MAL, "batcalc.ndigrams", SQLSTATE(HY002) OPERATION_FAILED); + } + + *getArgReference_lng(stk, pci, 0) = n; + + return MAL_SUCCEED; +} + + +/* + * String imprints dev/testing. TODO: end remove. + */ + #include "mel.h" static str @@ -2187,7 +2220,17 @@ static mel_func batcalc_init_funcs[] = { pattern("batcalc", "ifthenelse", CMDifthen, false, "If-then-else operation to assemble a conditional result", args(1,4, batargany("",1),batarg("b",bit),batargany("b1",1),argany("v2",1))), pattern("batcalc", "ifthenelse", CMDifthen, false, "If-then-else operation to assemble a conditional result", args(1,4, batargany("",1),batarg("b",bit),argany("v1",1),batargany("b2",1))), pattern("batcalc", "ifthenelse", CMDifthen, false, "If-then-else operation to assemble a conditional result", args(1,4, batargany("",1),batarg("b",bit),batargany("b1",1),batargany("b2",1))), + + /* + * String imprints dev/testing. TODO: remove. + */ + pattern("batcalc", "count_digrams", CMDstrimp_ndigrams, false, "count digrams in a string bat", args(1, 2, arg("",lng), batarg("b", str))), + + /* + * String imprints dev/testing. TODO: end remove. + */ { .imp=NULL } + }; #include "mal_import.h" #ifdef _MSC_VER _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list