Changeset: fbcd6ce89476 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=fbcd6ce89476
Modified Files:
        gdk/gdk_strimps.c
Branch: string_imprints
Log Message:

Count byte pairs instead of Unicode character pairs


diffs (83 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -13,33 +13,33 @@
 /* This counts how many unicode codepoints the given string
  * contains.
  */
-static size_t
-GDKstrimp_strlen(const uint8_t *s)
-{
-       size_t ret = 0;
-       size_t i;
-       int m,n;
-       uint8_t c;
+/* static size_t */
+/* GDKstrimp_strlen(const uint8_t *s) */
+/* { */
+/*     size_t ret = 0; */
+/*     size_t i; */
+/*     int m,n; */
+/*     uint8_t c; */
 
-       i = 0;
-       while((c = *(s + i)) != 0) {
-               if (c < 0x80)
-                       i++;
-               else {
-                       for (n = 0, m=0x40; c & m; n++, m >>= 1)
-                               ;
-                       /* n is now the number of 10xxxxxx bytes that should
-                          follow. */
-                       if (n == 0 || n >= 4)
-                               /* TODO: handle invalid utf-8 */
-                               {}
-                       i += n+1;
-               }
-               ret++;
-       }
+/*     i = 0; */
+/*     while((c = *(s + i)) != 0) { */
+/*             if (c < 0x80) */
+/*                     i++; */
+/*             else { */
+/*                     for (n = 0, m=0x40; c & m; n++, m >>= 1) */
+/*                             ; */
+/*                     /\* n is now the number of 10xxxxxx bytes that should */
+/*                        follow. *\/ */
+/*                     if (n == 0 || n >= 4) */
+/*                             /\* TODO: handle invalid utf-8 *\/ */
+/*                             {} */
+/*                     i += n+1; */
+/*             } */
+/*             ret++; */
+/*     } */
 
-       return ret;
-}
+/*     return ret; */
+/* } */
 
 /* Given a BAT return the number of digrams in it. The observation is
  * that the number of digrams is the number of characters - 1:
@@ -55,7 +55,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
        // lng t0;
        BUN i;
        BATiter bi;
-       uint8_t *s;
+       char *s;
        // GDKtracer_set_component_level("ALGO", "DEBUG");
        // struct canditer ci;
 
@@ -66,8 +66,9 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
        bi = bat_iterator(b);
        *n = 0;
        for (i = 0; i < b->batCount; i++) {
-               s = (uint8_t *)BUNtail(bi, i);
-                *n += GDKstrimp_strlen(s) - 1;
+               s = (char *)BUNtail(bi, i);
+                // *n += GDKstrimp_strlen(s) - 1;
+               *n += strlen(s) - 1;
                // TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, s);
        }
 
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to