Changeset: fd44dd6cb5fc for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/fd44dd6cb5fc
Modified Files:
	clients/Tests/MAL-signatures-hge.test
	clients/Tests/MAL-signatures.test
	clients/Tests/exports.stable.out
	monetdb5/modules/mal/txtsim.c
Branch: txtsim
Log Message:
approved output (tiny fix on interface mismatch (dbl vs double)) diffs (254 lines): diff --git a/clients/Tests/MAL-signatures-hge.test b/clients/Tests/MAL-signatures-hge.test --- a/clients/Tests/MAL-signatures-hge.test +++ b/clients/Tests/MAL-signatures-hge.test @@ -35447,7 +35447,7 @@ battxtsim similarity command battxtsim.similarity(X_0:bat[:str], X_1:bat[:str]):bat[:dbl] fstrcmp0_impl_bulk; -Normalized edit distance between two strings +(Deprecated) Normalized edit distance between two strings baturl extractURLHost command baturl.extractURLHost(X_0:bat[:str], X_1:bit):bat[:str] @@ -50879,60 +50879,80 @@ pattern tokenizer.take(X_0:oid):str TKNZRtakeOid; reconstruct and returns the i-th string txtsim +damerau_levenshtein +pattern txtsim.damerau_levenshtein(X_0:str, X_1:str):int +damerau_levenshtein_distance; +Calculates Damerau-Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1, transposition = 2) +txtsim +damerau_levenshtein +pattern txtsim.damerau_levenshtein(X_0:str, X_1:str, X_2:int, X_3:int, X_4:int):int +damerau_levenshtein_distance; +Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, replacement, transposition) +txtsim editdistance command txtsim.editdistance(X_0:str, X_1:str):int -levenshteinbasic_impl; -Alias for Levenshtein(str,str) +damerau_levenshtein1; +Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 2 txtsim editdistance2 command txtsim.editdistance2(X_0:str, X_1:str):int -levenshteinbasic2_impl; -Calculates Levenshtein distance (edit distance) between two strings. 
Cost of transposition is 1 instead of 2 +damerau_levenshtein2; +Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 1 +txtsim +jaro_winkler_similarity +command txtsim.jaro_winkler_similarity(X_0:str, X_1:str):dbl +jaro_winkler_similarity; +Calculate Jaro Winkler similarity txtsim levenshtein -command txtsim.levenshtein(X_0:str, X_1:str, X_2:int, X_3:int, X_4:int):int -levenshtein_impl; -Calculates Levenshtein distance (edit distance) between two strings, variable operation costs (ins/del, replacement, transposition) +pattern txtsim.levenshtein(X_0:str, X_1:str):int +levenshtein_distance; +Calculates Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1) txtsim levenshtein -command txtsim.levenshtein(X_0:str, X_1:str):int -levenshteinbasic_impl; -Calculates Levenshtein distance (edit distance) between two strings +pattern txtsim.levenshtein(X_0:str, X_1:str, X_2:int, X_3:int):int +levenshtein_distance; +Calculates Levenshtein distance between two strings, variable operation costs (ins/del, replacement) +txtsim +levenshtein +pattern txtsim.levenshtein(X_0:str, X_1:str, X_2:int, X_3:int, X_4:int):int +levenshtein_distance; +(Backwards compatibility purposes) Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, replacement, transposition) txtsim qgramnormalize command txtsim.qgramnormalize(X_0:str):str -CMDqgramnormalize; +qgram_normalize; 'Normalizes' strings (eg. toUpper and replaces non-alphanumerics with one space txtsim qgramselfjoin command txtsim.qgramselfjoin(X_0:bat[:oid], X_1:bat[:oid], X_2:bat[:int], X_3:bat[:int], X_4:flt, X_5:int) (X_6:bat[:int], X_7:bat[:int]) -CMDqgramselfjoin; +qgram_selfjoin; QGram self-join on ordered(!) 
qgram tables and sub-ordered q-gram positions txtsim similarity command txtsim.similarity(X_0:str, X_1:str):dbl fstrcmp0_impl; -Normalized edit distance between two strings +(Deprecated) Normalized edit distance between two strings txtsim similarity command txtsim.similarity(X_0:str, X_1:str, X_2:dbl):dbl fstrcmp_impl; -Normalized edit distance between two strings +(Deprecated) Normalized edit distance between two strings txtsim soundex command txtsim.soundex(X_0:str):str -soundex_impl; +soundex; Soundex function for phonetic matching txtsim str2qgrams command txtsim.str2qgrams(X_0:str):bat[:str] -CMDstr2qgrams; +str_2_qgrams; Break the string into 4-grams txtsim stringdiff command txtsim.stringdiff(X_0:str, X_1:str):int -stringdiff_impl; -calculate the soundexed editdistance +stringdiff; +Calculate the soundexed editdistance url extractURLHost command url.extractURLHost(X_0:str, X_1:bit):str diff --git a/clients/Tests/MAL-signatures.test b/clients/Tests/MAL-signatures.test --- a/clients/Tests/MAL-signatures.test +++ b/clients/Tests/MAL-signatures.test @@ -26497,7 +26497,7 @@ battxtsim similarity command battxtsim.similarity(X_0:bat[:str], X_1:bat[:str]):bat[:dbl] fstrcmp0_impl_bulk; -Normalized edit distance between two strings +(Deprecated) Normalized edit distance between two strings baturl extractURLHost command baturl.extractURLHost(X_0:bat[:str], X_1:bit):bat[:str] @@ -39204,60 +39204,80 @@ pattern tokenizer.take(X_0:oid):str TKNZRtakeOid; reconstruct and returns the i-th string txtsim +damerau_levenshtein +pattern txtsim.damerau_levenshtein(X_0:str, X_1:str):int +damerau_levenshtein_distance; +Calculates Damerau-Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1, transposition = 2) +txtsim +damerau_levenshtein +pattern txtsim.damerau_levenshtein(X_0:str, X_1:str, X_2:int, X_3:int, X_4:int):int +damerau_levenshtein_distance; +Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, 
replacement, transposition) +txtsim editdistance command txtsim.editdistance(X_0:str, X_1:str):int -levenshteinbasic_impl; -Alias for Levenshtein(str,str) +damerau_levenshtein1; +Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 2 txtsim editdistance2 command txtsim.editdistance2(X_0:str, X_1:str):int -levenshteinbasic2_impl; -Calculates Levenshtein distance (edit distance) between two strings. Cost of transposition is 1 instead of 2 +damerau_levenshtein2; +Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and transpose = 1 +txtsim +jaro_winkler_similarity +command txtsim.jaro_winkler_similarity(X_0:str, X_1:str):dbl +jaro_winkler_similarity; +Calculate Jaro Winkler similarity txtsim levenshtein -command txtsim.levenshtein(X_0:str, X_1:str, X_2:int, X_3:int, X_4:int):int -levenshtein_impl; -Calculates Levenshtein distance (edit distance) between two strings, variable operation costs (ins/del, replacement, transposition) +pattern txtsim.levenshtein(X_0:str, X_1:str):int +levenshtein_distance; +Calculates Levenshtein distance between two strings, operation costs (ins/del = 1, replacement = 1) txtsim levenshtein -command txtsim.levenshtein(X_0:str, X_1:str):int -levenshteinbasic_impl; -Calculates Levenshtein distance (edit distance) between two strings +pattern txtsim.levenshtein(X_0:str, X_1:str, X_2:int, X_3:int):int +levenshtein_distance; +Calculates Levenshtein distance between two strings, variable operation costs (ins/del, replacement) +txtsim +levenshtein +pattern txtsim.levenshtein(X_0:str, X_1:str, X_2:int, X_3:int, X_4:int):int +levenshtein_distance; +(Backwards compatibility purposes) Calculates Damerau-Levenshtein distance between two strings, variable operation costs (ins/del, replacement, transposition) txtsim qgramnormalize command txtsim.qgramnormalize(X_0:str):str -CMDqgramnormalize; +qgram_normalize; 'Normalizes' strings (eg. 
toUpper and replaces non-alphanumerics with one space txtsim qgramselfjoin command txtsim.qgramselfjoin(X_0:bat[:oid], X_1:bat[:oid], X_2:bat[:int], X_3:bat[:int], X_4:flt, X_5:int) (X_6:bat[:int], X_7:bat[:int]) -CMDqgramselfjoin; +qgram_selfjoin; QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions txtsim similarity command txtsim.similarity(X_0:str, X_1:str):dbl fstrcmp0_impl; -Normalized edit distance between two strings +(Deprecated) Normalized edit distance between two strings txtsim similarity command txtsim.similarity(X_0:str, X_1:str, X_2:dbl):dbl fstrcmp_impl; -Normalized edit distance between two strings +(Deprecated) Normalized edit distance between two strings txtsim soundex command txtsim.soundex(X_0:str):str -soundex_impl; +soundex; Soundex function for phonetic matching txtsim str2qgrams command txtsim.str2qgrams(X_0:str):bat[:str] -CMDstr2qgrams; +str_2_qgrams; Break the string into 4-grams txtsim stringdiff command txtsim.stringdiff(X_0:str, X_1:str):int -stringdiff_impl; -calculate the soundexed editdistance +stringdiff; +Calculate the soundexed editdistance url extractURLHost command url.extractURLHost(X_0:str, X_1:bit):str diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out --- a/clients/Tests/exports.stable.out +++ b/clients/Tests/exports.stable.out @@ -784,6 +784,8 @@ void TABLETdestroy_format(Tablet *as); int TABLEToutput_file(Tablet *as, BAT *order, stream *s); int TRACEtable(Client cntxt, BAT **r); int TYPE_xml; +int UTF8_strlen(const char *restrict s); +int UTF8_strwidth(const char *restrict s); void addMalException(MalBlkPtr mb, str msg); str addOptimizerPipe(Client cntxt, MalBlkPtr mb, const char *name); str addPipeDefinition(Client cntxt, const char *name, const char *pipe); @@ -1232,6 +1234,7 @@ const char *stoptraceRef; void strAfterCall(ValPtr v, ValPtr bak); void strBeforeCall(ValPtr v, ValPtr bak); const char *strRef; +int str_strlen(const char *restrict s); const char 
*streamsRef; const char *strimpsRef; const char *subavgRef; diff --git a/monetdb5/modules/mal/txtsim.c b/monetdb5/modules/mal/txtsim.c --- a/monetdb5/modules/mal/txtsim.c +++ b/monetdb5/modules/mal/txtsim.c @@ -394,7 +394,7 @@ jaro_winkler(const str_item *x, const st } static str -jaro_winkler_similarity(double *ret, str *x, str *y) +jaro_winkler_similarity(dbl *ret, str *x, str *y) { int *x_flags = NULL, *y_flags = NULL; str_item xi = { 0 }, yi = { 0 }; _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org