Changeset: 56acedc51735 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=56acedc51735
Modified Files:
        clients/Tests/MAL-signatures.stable.out
        clients/Tests/MAL-signatures.stable.out.int128
        sql/storage/store.c
        sql/test/BugTracker-2020/Tests/All
Branch: default
Log Message:

Merged with Oct2020


diffs (truncated from 516 to 300 lines):

diff --git a/clients/Tests/MAL-signatures.stable.out 
b/clients/Tests/MAL-signatures.stable.out
--- a/clients/Tests/MAL-signatures.stable.out
+++ b/clients/Tests/MAL-signatures.stable.out
@@ -6638,6 +6638,7 @@ stdout of test 'MAL-signatures` in direc
 [ "batstr",    "unicodeAt",    "pattern batstr.unicodeAt(X_1:str, 
X_2:bat[:int], X_3:bat[:oid]):bat[:int] ",   "STRbatWChrAt_strcst;", ""      ]
 [ "batstr",    "unicodeAt",    "pattern batstr.unicodeAt(X_1:bat[:str], 
X_2:int):bat[:int] ",  "STRbatWChrAtcst;",     ""      ]
 [ "batstr",    "unicodeAt",    "pattern batstr.unicodeAt(X_1:bat[:str], 
X_2:int, X_3:bat[:oid]):bat[:int] ",   "STRbatWChrAtcst;",     ""      ]
+[ "battxtsim", "similarity",   "command battxtsim.similarity(X_1:bat[:str], 
X_2:bat[:str]):bat[:dbl] ",        "fstrcmp0_impl_bulk;",  ""      ]
 [ "batudf",    "fuse", "command batudf.fuse(X_1:bat[:bte], 
X_2:bat[:bte]):bat[:sht] ", "UDFBATfuse;",  ""      ]
 [ "batudf",    "fuse", "command batudf.fuse(X_1:bat[:int], 
X_2:bat[:int]):bat[:lng] ", "UDFBATfuse;",  ""      ]
 [ "batudf",    "fuse", "command batudf.fuse(X_1:bat[:sht], 
X_2:bat[:sht]):bat[:int] ", "UDFBATfuse;",  ""      ]
diff --git a/clients/Tests/MAL-signatures.stable.out.int128 
b/clients/Tests/MAL-signatures.stable.out.int128
--- a/clients/Tests/MAL-signatures.stable.out.int128
+++ b/clients/Tests/MAL-signatures.stable.out.int128
@@ -9197,6 +9197,7 @@ stdout of test 'MAL-signatures` in direc
 [ "batstr",    "unicodeAt",    "pattern batstr.unicodeAt(X_1:str, 
X_2:bat[:int], X_3:bat[:oid]):bat[:int] ",   "STRbatWChrAt_strcst;", ""      ]
 [ "batstr",    "unicodeAt",    "pattern batstr.unicodeAt(X_1:bat[:str], 
X_2:int):bat[:int] ",  "STRbatWChrAtcst;",     ""      ]
 [ "batstr",    "unicodeAt",    "pattern batstr.unicodeAt(X_1:bat[:str], 
X_2:int, X_3:bat[:oid]):bat[:int] ",   "STRbatWChrAtcst;",     ""      ]
+[ "battxtsim", "similarity",   "command battxtsim.similarity(X_1:bat[:str], 
X_2:bat[:str]):bat[:dbl] ",        "fstrcmp0_impl_bulk;",  ""      ]
 [ "batudf",    "fuse", "command batudf.fuse(X_1:bat[:bte], 
X_2:bat[:bte]):bat[:sht] ", "UDFBATfuse;",  ""      ]
 [ "batudf",    "fuse", "command batudf.fuse(X_1:bat[:int], 
X_2:bat[:int]):bat[:lng] ", "UDFBATfuse;",  ""      ]
 [ "batudf",    "fuse", "command batudf.fuse(X_1:bat[:lng], 
X_2:bat[:lng]):bat[:hge] ", "UDFBATfuse;",  ""      ]
diff --git a/monetdb5/modules/mal/txtsim.c b/monetdb5/modules/mal/txtsim.c
--- a/monetdb5/modules/mal/txtsim.c
+++ b/monetdb5/modules/mal/txtsim.c
@@ -263,7 +263,6 @@ soundex_code(char *Name, char *Key)
        }
 }
 
-
 static str
 soundex_impl(str *res, str *Name)
 {
@@ -362,38 +361,6 @@ struct string_data {
        int edit_count;
 };
 
-static struct string_data string[2];
-
-static int max_edits;          /* compareseq stops when edits > max_edits */
-
-#ifdef MINUS_H_FLAG
-
-/* This corresponds to the diff -H flag.  With this heuristic, for
-   strings with a constant small density of changes, the algorithm is
-   linear in the strings size.  This is unlikely in typical uses of
-   fstrcmp, and so is usually compiled out.  Besides, there is no
-   interface to set it true.  */
-static int heuristic;
-
-#endif
-
-
-/* Vector, indexed by diagonal, containing 1 + the X coordinate of the
-   point furthest along the given diagonal in the forward search of the
-   edit matrix.  */
-static int *fdiag;
-
-/* Vector, indexed by diagonal, containing the X coordinate of the point
-   furthest along the given diagonal in the backward search of the edit
-   matrix.  */
-static int *bdiag;
-
-/* Edit scripts longer than this are too expensive to compute.  */
-static int too_expensive;
-
-/* Snakes bigger than this are considered `big'.  */
-#define SNAKE_LIMIT    20
-
 struct partition {
        /* Midpoints of this partition.  */
        int xmid, ymid;
@@ -405,7 +372,6 @@ struct partition {
        int hi_minimal;
 };
 
-
 /* NAME
        diag - find diagonal path
 
@@ -450,10 +416,10 @@ struct partition {
        cause suboptimal diff output.  It cannot cause incorrect diff
        output.  */
 
-static int diag PARAMS((int, int, int, int, int, struct partition *));
+static int diag PARAMS((int, int, int, int, int, struct partition *, int, 
struct string_data *, int *, int *));
 
 static int
-diag(int xoff, int xlim, int yoff, int ylim, int minimal, struct partition 
*part)
+diag(int xoff, int xlim, int yoff, int ylim, int minimal, struct partition 
*part, int too_expensive, struct string_data *string, int *fdiag, int *bdiag)
 {
        int *const fd = fdiag;  /* Give the compiler a chance. */
        int *const bd = bdiag;  /* Additional help for the compiler. */
@@ -478,9 +444,7 @@ diag(int xoff, int xlim, int yoff, int y
        bd[bmid] = xlim;
        for (c = 1;; ++c) {
                int d;          /* Active diagonal. */
-               int big_snake;
 
-               big_snake = 0;
                /* Extend the top-down search by an edit step in each diagonal. 
*/
                if (fmin > dmin)
                        fd[--fmin - 1] = -1;
@@ -493,7 +457,6 @@ diag(int xoff, int xlim, int yoff, int y
                for (d = fmax; d >= fmin; d -= 2) {
                        int x;
                        int y;
-                       int oldx;
                        int tlo;
                        int thi;
 
@@ -503,14 +466,11 @@ diag(int xoff, int xlim, int yoff, int y
                                x = tlo + 1;
                        else
                                x = thi;
-                       oldx = x;
                        y = x - d;
                        while (x < xlim && y < ylim && xv[x] == yv[y]) {
                                ++x;
                                ++y;
                        }
-                       if (x - oldx > SNAKE_LIMIT)
-                               big_snake = 1;
                        fd[d] = x;
                        if (odd && bmin <= d && d <= bmax && bd[d] <= x) {
                                part->xmid = x;
@@ -531,7 +491,6 @@ diag(int xoff, int xlim, int yoff, int y
                for (d = bmax; d >= bmin; d -= 2) {
                        int x;
                        int y;
-                       int oldx;
                        int tlo;
                        int thi;
 
@@ -540,14 +499,11 @@ diag(int xoff, int xlim, int yoff, int y
                                x = tlo;
                        else
                                x = thi - 1;
-                       oldx = x;
                        y = x - d;
                        while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) {
                                --x;
                                --y;
                        }
-                       if (oldx - x > SNAKE_LIMIT)
-                               big_snake = 1;
                        bd[d] = x;
                        if (!odd && fmin <= d && d <= fmax && x <= fd[d]) {
                                part->xmid = x;
@@ -560,90 +516,6 @@ diag(int xoff, int xlim, int yoff, int y
                if (minimal)
                        continue;
 
-#ifdef MINUS_H_FLAG
-               /* Heuristic: check occasionally for a diagonal that has made 
lots
-                  of progress compared with the edit distance.  If we have any
-                  such, find the one that has made the most progress and return
-                  it as if it had succeeded.
-
-                  With this heuristic, for strings with a constant small 
density
-                  of changes, the algorithm is linear in the strings size.  */
-               if (c > 200 && big_snake && heuristic) {
-                       int best;
-
-                       best = 0;
-                       for (d = fmax; d >= fmin; d -= 2) {
-                               int dd;
-                               int x;
-                               int y;
-                               int v;
-
-                               dd = d - fmid;
-                               x = fd[d];
-                               y = x - d;
-                               v = (x - xoff) * 2 - dd;
-
-                               if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
-                                       if (v > best && xoff + SNAKE_LIMIT <= x 
&& x < xlim && yoff + SNAKE_LIMIT <= y && y < ylim) {
-                                               /* We have a good enough best 
diagonal; now insist
-                                                  that it end with a 
significant snake.  */
-                                               int k;
-
-                                               for (k = 1; xv[x - k] == yv[y - 
k]; k++) {
-                                                       if (k == SNAKE_LIMIT) {
-                                                               best = v;
-                                                               part->xmid = x;
-                                                               part->ymid = y;
-                                                               break;
-                                                       }
-                                               }
-                                       }
-                               }
-                       }
-                       if (best > 0) {
-                               part->lo_minimal = 1;
-                               part->hi_minimal = 0;
-                               return 2 * c - 1;
-                       }
-                       best = 0;
-                       for (d = bmax; d >= bmin; d -= 2) {
-                               int dd;
-                               int x;
-                               int y;
-                               int v;
-
-                               dd = d - bmid;
-                               x = bd[d];
-                               y = x - d;
-                               v = (xlim - x) * 2 + dd;
-
-                               if (v > 12 * (c + (dd < 0 ? -dd : dd))) {
-                                       if (v > best && xoff < x && x <= xlim - 
SNAKE_LIMIT && yoff < y && y <= ylim - SNAKE_LIMIT) {
-                                               /* We have a good enough best 
diagonal; now insist
-                                                  that it end with a 
significant snake.  */
-                                               int k;
-
-                                               for (k = 0; xv[x + k] == yv[y + 
k]; k++) {
-                                                       if (k == SNAKE_LIMIT - 
1) {
-                                                               best = v;
-                                                               part->xmid = x;
-                                                               part->ymid = y;
-                                                               break;
-                                                       }
-                                               }
-                                       }
-                               }
-                       }
-                       if (best > 0) {
-                               part->lo_minimal = 0;
-                               part->hi_minimal = 1;
-                               return 2 * c - 1;
-                       }
-               }
-#else
-               (void) big_snake;
-#endif /* MINUS_H_FLAG */
-
                /* Heuristic: if we've gone well beyond the call of duty, give 
up
                   and report halfway between our best results so far.  */
                if (c >= too_expensive) {
@@ -729,10 +601,10 @@ diag(int xoff, int xlim, int yoff, int y
        If MINIMAL is nonzero, find a minimal difference no matter how
        expensive it is.  */
 
-static void compareseq PARAMS((int, int, int, int, int));
+static void compareseq PARAMS((int, int, int, int, int, int, int, struct 
string_data *, int*, int*));
 
 static void
-compareseq(int xoff, int xlim, int yoff, int ylim, int minimal)
+compareseq(int xoff, int xlim, int yoff, int ylim, int minimal, int max_edits, 
int too_expensive, struct string_data *string, int *fdiag, int *bdiag) /* 
compareseq stops when edits > max_edits */
 {
        const char *const xv = string[0].data;  /* Help the compiler.  */
        const char *const yv = string[1].data;
@@ -768,26 +640,18 @@ compareseq(int xoff, int xlim, int yoff,
                struct partition part;
 
                /* Find a point of correspondence in the middle of the strings. 
 */
-               c = diag(xoff, xlim, yoff, ylim, minimal, &part);
+               c = diag(xoff, xlim, yoff, ylim, minimal, &part, too_expensive, 
string, fdiag, bdiag);
                if (c == 1) {
-#if 0
-                       /* This should be impossible, because it implies that 
one of
-                          the two subsequences is empty, and that case was 
handled
-                          above without calling `diag'.  Let's verify that 
this is
-                          true.  */
-                       abort();
-#else
                        /* The two subsequences differ by a single insert or 
delete;
                           record it and we are done.  */
                        if (part.xmid - part.ymid < xoff - yoff)
                                ++string[1].edit_count;
                        else
                                ++string[0].edit_count;
-#endif
                } else {
                        /* Use the partitions to split this problem into 
subproblems.  */
-                       compareseq(xoff, part.xmid, yoff, part.ymid, 
part.lo_minimal);
-                       compareseq(part.xmid, xlim, part.ymid, ylim, 
part.hi_minimal);
+                       compareseq(xoff, part.xmid, yoff, part.ymid, 
part.lo_minimal, max_edits, too_expensive, string, fdiag, bdiag);
+                       compareseq(part.xmid, xlim, part.ymid, ylim, 
part.hi_minimal, max_edits, too_expensive, string, fdiag, bdiag);
                }
        }
 }
@@ -810,20 +674,11 @@ compareseq(int xoff, int xlim, int yoff,
        similar.  */
 
 static str
-fstrcmp_impl(dbl *ret, str *S1, str *S2, dbl *minimum)
+fstrcmp_impl_internal(dbl *ret, str string1, str string2, dbl minimum)
 {
-       char *string1 = *S1;
-       char *string2 = *S2;
-       int i;
-
+       int i, max_edits, *fdiag, *bdiag, *fdiag_buf = NULL, too_expensive = 1;
        size_t fdiag_len;
-       static int *fdiag_buf;
-       static size_t fdiag_max;
-
-       if (strNil(*S1) || strNil(*S2) || is_dbl_nil(*minimum)) {
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to