Changeset: fbcd6ce89476 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=fbcd6ce89476
Modified Files: gdk/gdk_strimps.c
Branch: string_imprints
Log Message:
Count byte pairs instead of unicode character pairs diffs (83 lines): diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c --- a/gdk/gdk_strimps.c +++ b/gdk/gdk_strimps.c @@ -13,33 +13,33 @@ /* This counts how many unicode codepoints the given string * contains. */ -static size_t -GDKstrimp_strlen(const uint8_t *s) -{ - size_t ret = 0; - size_t i; - int m,n; - uint8_t c; +/* static size_t */ +/* GDKstrimp_strlen(const uint8_t *s) */ +/* { */ +/* size_t ret = 0; */ +/* size_t i; */ +/* int m,n; */ +/* uint8_t c; */ - i = 0; - while((c = *(s + i)) != 0) { - if (c < 0x80) - i++; - else { - for (n = 0, m=0x40; c & m; n++, m >>= 1) - ; - /* n is now the number of 10xxxxxx bytes that should - follow. */ - if (n == 0 || n >= 4) - /* TODO: handle invalid utf-8 */ - {} - i += n+1; - } - ret++; - } +/* i = 0; */ +/* while((c = *(s + i)) != 0) { */ +/* if (c < 0x80) */ +/* i++; */ +/* else { */ +/* for (n = 0, m=0x40; c & m; n++, m >>= 1) */ +/* ; */ +/* /\* n is now the number of 10xxxxxx bytes that should */ +/* follow. *\/ */ +/* if (n == 0 || n >= 4) */ +/* /\* TODO: handle invalid utf-8 *\/ */ +/* {} */ +/* i += n+1; */ +/* } */ +/* ret++; */ +/* } */ - return ret; -} +/* return ret; */ +/* } */ /* Given a BAT return the number of digrams in it. The observation is * that the number of digrams is the number of characters - 1: @@ -55,7 +55,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) // lng t0; BUN i; BATiter bi; - uint8_t *s; + char *s; // GDKtracer_set_component_level("ALGO", "DEBUG"); // struct canditer ci; @@ -66,8 +66,9 @@ GDKstrimp_ndigrams(BAT *b, size_t *n) bi = bat_iterator(b); *n = 0; for (i = 0; i < b->batCount; i++) { - s = (uint8_t *)BUNtail(bi, i); - *n += GDKstrimp_strlen(s) - 1; + s = (char *)BUNtail(bi, i); + // *n += GDKstrimp_strlen(s) - 1; + *n += strlen(s) - 1; // TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, s); } _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list