Changeset: 0cd7b7d1e433 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/0cd7b7d1e433
Modified Files:
        monetdb5/modules/atoms/str.c
Branch: Dec2023
Log Message:

Backport some changes from the default branch.


diffs (truncated from 334 to 300 lines):

diff --git a/monetdb5/modules/atoms/str.c b/monetdb5/modules/atoms/str.c
--- a/monetdb5/modules/atoms/str.c
+++ b/monetdb5/modules/atoms/str.c
@@ -108,7 +108,7 @@
  */
 
 /* These tables were generated from the Unicode 13.0.0 spec. */
-const struct UTF8_lower_upper {
+static const struct UTF8_lower_upper {
        const unsigned int from, to;
 } UTF8_toUpper[] = {                   /* code points with non-null uppercase conversion */
        {0x0061, 0x0041,},
@@ -3102,55 +3102,29 @@ STRepilogue(void *ret)
 }
 
 #ifndef NDEBUG
-static void
-UTF8_assert(const char *restrict s)
+static inline void
+UTF8_assert(const char *s)
 {
-       int c;
-
-       if (s == NULL)
-               return;
-       if (*s == '\200' && s[1] == '\0')
-               return;                                 /* str_nil */
-       while ((c = *s++) != '\0') {
-               if ((c & 0x80) == 0)
-                       continue;
-               if ((*s++ & 0xC0) != 0x80)
-                       assert(0);
-               if ((c & 0xE0) == 0xC0)
-                       continue;
-               if ((*s++ & 0xC0) != 0x80)
-                       assert(0);
-               if ((c & 0xF0) == 0xE0)
-                       continue;
-               if ((*s++ & 0xC0) != 0x80)
-                       assert(0);
-               if ((c & 0xF8) == 0xF0)
-                       continue;
-               assert(0);
-       }
+       assert(strNil(s) || utf8valid(s) == 0);
 }
 #else
 #define UTF8_assert(s)         ((void) 0)
 #endif
 
+/* return the number of codepoints in the substring [s, end), or -1 if s > end */
 static inline int
 UTF8_strpos(const char *s, const char *end)
 {
-       int pos = 0;
-
        UTF8_assert(s);
 
        if (s > end) {
                return -1;
        }
-       while (s < end) {
-               /* just count leading bytes of encoded code points; only works
-                * for correctly encoded UTF-8 */
-               pos += (*s++ & 0xC0) != 0x80;
-       }
-       return pos;
+       return (int) utf8nlen(s, (size_t) (end - s));
 }
 
+/* return a pointer to the byte that starts the pos'th (0-based)
+ * codepoint in s */
 static inline str
 UTF8_strtail(const char *s, int pos)
 {
@@ -3166,6 +3140,7 @@ UTF8_strtail(const char *s, int pos)
        return (str) s;
 }
 
+/* copy n Unicode codepoints from s to dst, return pointer to new end */
 static inline str
 UTF8_strncpy(char *restrict dst, const char *restrict s, int n)
 {
@@ -3196,56 +3171,29 @@ UTF8_strncpy(char *restrict dst, const c
        return dst;
 }
 
-static inline str
-UTF8_offset(char *restrict s, int n)
-{
-       UTF8_assert(s);
-       while (*s && n) {
-               if ((*s & 0xF8) == 0xF0) {
-                       /* 4 byte UTF-8 sequence */
-                       s += 4;
-               } else if ((*s & 0xF0) == 0xE0) {
-                       /* 3 byte UTF-8 sequence */
-                       s += 3;
-               } else if ((*s & 0xE0) == 0xC0) {
-                       /* 2 byte UTF-8 sequence */
-                       s += 2;
-               } else {
-                       /* 1 byte UTF-8 "sequence" */
-                       s++;
-               }
-               n--;
-       }
-       return s;
-}
-
+/* return number of Unicode codepoints in s; s is not nil */
 int
-UTF8_strlen(const char *restrict s)
-{                                                              /* This function assumes, s is never nil */
-       size_t pos = 0;
-
+UTF8_strlen(const char *s)
+{                                                              /* This function assumes s is never nil */
        UTF8_assert(s);
        assert(!strNil(s));
 
-       while (*s) {
-               /* just count leading bytes of encoded code points; only works
-                * for correctly encoded UTF-8 */
-               pos += (*s++ & 0xC0) != 0x80;
-       }
-       assert(pos < INT_MAX);
-       return (int) pos;
+       return (int) utf8len(s);
 }
 
+/* return (int) strlen(s); s is not nil */
 int
-str_strlen(const char *restrict s)
-{                                                              /* This function assumes, s is never nil */
-       size_t pos = strlen(s);
-       assert(pos < INT_MAX);
-       return (int) pos;
+str_strlen(const char *s)
+{                                                              /* This function assumes s is never nil */
+       UTF8_assert(s);
+       assert(!strNil(s));
+
+       return (int) strlen(s);
 }
 
+/* return the display width of s */
 int
-UTF8_strwidth(const char *restrict s)
+UTF8_strwidth(const char *s)
 {
        int len = 0;
        int c;
@@ -3632,6 +3580,10 @@ STRTail(str *res, const str *arg1, const
        return msg;
 }
 
+/* copy the substring s[off:off+l] into *buf, replacing *buf with a
+ * freshly allocated buffer if the substring doesn't fit; off is 0
+ * based, and both off and l count in Unicode codepoints (i.e. not
+ * bytes); if off < 0, off counts from the end of the string */
 str
 str_Sub_String(str *buf, size_t *buflen, const char *s, int off, int l)
 {
@@ -3905,10 +3857,14 @@ STRstartswith(Client cntxt, MalBlkPtr mb
 
        STR_MAPARGS(stk, pci, r, s1, s2, icase);
 
-       int s2_len = str_strlen(s2);
-       *r = (strNil(s1) || strNil(s2)) ? bit_nil :
-               icase ? str_is_iprefix(s1, s2, s2_len) == 0 :
-                       str_is_prefix(s1, s2, s2_len) == 0;
+       if (strNil(s1) || strNil(s2)) {
+               *r = bit_nil;
+       } else {
+               int s2_len = str_strlen(s2);
+               *r = icase ?
+                       str_is_iprefix(s1, s2, s2_len) == 0 :
+                       str_is_prefix(s1, s2, s2_len) == 0;
+       }
        return MAL_SUCCEED;
 }
 
@@ -3923,10 +3879,14 @@ STRendswith(Client cntxt, MalBlkPtr mb, 
 
        STR_MAPARGS(stk, pci, r, s1, s2, icase);
 
-       int s2_len = str_strlen(s2);
-       *r = (strNil(s1) || strNil(s2)) ? bit_nil :
-               icase ? str_is_isuffix(s1, s2, s2_len) == 0 :
-                       str_is_suffix(s1, s2, s2_len) == 0;
+       if (strNil(s1) || strNil(s2)) {
+               *r = bit_nil;
+       } else {
+               int s2_len = str_strlen(s2);
+               *r = icase ?
+                       str_is_isuffix(s1, s2, s2_len) == 0 :
+                       str_is_suffix(s1, s2, s2_len) == 0;
+       }
        return MAL_SUCCEED;
 }
 
@@ -3942,10 +3902,14 @@ STRcontains(Client cntxt, MalBlkPtr mb, 
 
        STR_MAPARGS(stk, pci, r, s1, s2, icase);
 
-       int s2_len = str_strlen(s2);
-       *r = (strNil(s1) || strNil(s2)) ? bit_nil :
-               icase ? str_icontains(s1, s2, s2_len) == 0 :
-                       str_contains(s1, s2, s2_len) == 0;
+       if (strNil(s1) || strNil(s2)) {
+               *r = bit_nil;
+       } else {
+               int s2_len = str_strlen(s2);
+               *r = icase ?
+                       str_icontains(s1, s2, s2_len) == 0 :
+                       str_contains(s1, s2, s2_len) == 0;
+       }
        return MAL_SUCCEED;
 }
 
@@ -3983,11 +3947,15 @@ STRstr_search(Client cntxt, MalBlkPtr mb
        bit icase = pci->argc == 4
                        && *getArgReference_bit(stk, pci, 3) ? true : false;
        str s = *haystack, h = *needle, msg = MAL_SUCCEED;
-       int needle_len = str_strlen(h);
-
-       *res = (strNil(s) || strNil(h)) ? bit_nil :
-                       icase ? str_isearch(s, h, needle_len) : str_search(s, h,
-                                                                               
                                           needle_len);
+       if (strNil(s) || strNil(h)) {
+               *res = bit_nil;
+       } else {
+               int needle_len = str_strlen(h);
+
+               *res = icase ?
+                       str_isearch(s, h, needle_len) :
+                       str_search(s, h, needle_len);
+       }
        return msg;
 }
 
@@ -4041,12 +4009,15 @@ STRrevstr_search(Client cntxt, MalBlkPtr
        bit icase = pci->argc == 4
                        && *getArgReference_bit(stk, pci, 3) ? true : false;
        str s = *haystack, h = *needle, msg = MAL_SUCCEED;
-       int needle_len = str_strlen(h);
-
-       *res = (strNil(s) || strNil(h)) ? bit_nil :
-                       icase ? str_reverse_str_isearch(s, h,
-                                                                               
        needle_len) :
+       if (strNil(s) || strNil(h)) {
+               *res = bit_nil;
+       } else {
+               int needle_len = str_strlen(h);
+
+               *res = icase ?
+                       str_reverse_str_isearch(s, h, needle_len) :
                        str_reverse_str_search(s, h, needle_len);
+       }
        return msg;
 }
 
@@ -4966,8 +4937,9 @@ STRlocate3(int *ret, const str *needle, 
        const char *s = *needle, *s2 = *haystack;
        int st = *start;
 
-       *ret = (strNil(s) || strNil(s2)
-                       || is_int_nil(st)) ? int_nil : str_locate2(s, s2, st);
+       *ret = (strNil(s) || strNil(s2) || is_int_nil(st)) ?
+               int_nil :
+               str_locate2(s, s2, st);
        return MAL_SUCCEED;
 }
 
@@ -5008,7 +4980,7 @@ str_insert(str *buf, size_t *buflen, con
                v = UTF8_strncpy(v, s, strt);
        strcpy(v, s2);
        if (strt + l < l1)
-               strcat(v, UTF8_offset((char *) s, strt + l));
+               strcat(v, UTF8_strtail((char *) s, strt + l));
        return MAL_SUCCEED;
 }
 
@@ -5182,10 +5154,10 @@ BBPnreclaim(int nargs, ...)
 /* scan select loop with or without candidates */
 #define scanloop(TEST, KEEP_NULLS)                                             
                            \
        do {                                                                    
                                                        \
-               TRC_DEBUG(ALGO,                                                 
                                        \
-                                 "scanselect(b=%s#"BUNFMT",anti=%d): "         
                \
+               TRC_DEBUG(ALGO,                                                 
                                                \
+                                 "scanselect(b=%s#"BUNFMT",anti=%d): "         
                        \
                                  "scanselect %s\n", BATgetId(b), BATcount(b),  
                \
-                                 anti, #TEST);                                 
                                        \
+                                 anti, #TEST);                                 
                                                \
                if (!s || BATtdense(s)) {                                       
                                        \
                        for (; p < q; p++) {                                    
                                        \
                                GDK_CHECK_TIMEOUT(timeoffset, counter,          
                        \
@@ -5207,12 +5179,12 @@ BBPnreclaim(int nargs, ...)
        } while (0)
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

Reply via email to