This is an automated email from the ASF dual-hosted git repository. xiong pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/calcite.git
The following commit(s) were added to refs/heads/main by this push: new 132cc3df5c [CALCITE-5851] Add LEVENSHTEIN function (enabled in Hive and Spark library) 132cc3df5c is described below commit 132cc3df5c3248840e175bc6687718304177a91d Author: Runkang He <hrun...@gmail.com> AuthorDate: Thu Jul 20 17:54:05 2023 +0800 [CALCITE-5851] Add LEVENSHTEIN function (enabled in Hive and Spark library) --- bom/build.gradle.kts | 1 + core/build.gradle.kts | 1 + .../calcite/adapter/enumerable/RexImpTable.java | 2 ++ .../java/org/apache/calcite/runtime/SqlFunctions.java | 9 +++++++++ .../apache/calcite/sql/fun/SqlLibraryOperators.java | 8 ++++++++ .../java/org/apache/calcite/util/BuiltInMethod.java | 1 + gradle.properties | 1 + site/_docs/reference.md | 1 + .../java/org/apache/calcite/test/SqlOperatorTest.java | 19 +++++++++++++++++++ 9 files changed, 43 insertions(+) diff --git a/bom/build.gradle.kts b/bom/build.gradle.kts index 10ac5b06ec..e4b3a82aa2 100644 --- a/bom/build.gradle.kts +++ b/bom/build.gradle.kts @@ -101,6 +101,7 @@ dependencies { apiv("org.apache.commons:commons-math3") apiv("org.apache.commons:commons-pool2") apiv("org.apache.commons:commons-collections4") + apiv("org.apache.commons:commons-text") apiv("org.apache.geode:geode-core") apiv("org.apache.hadoop:hadoop-client", "hadoop") apiv("org.apache.hadoop:hadoop-common", "hadoop") diff --git a/core/build.gradle.kts b/core/build.gradle.kts index a447f25c48..1aa45832c1 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -71,6 +71,7 @@ dependencies { implementation("org.apache.commons:commons-dbcp2") implementation("org.apache.commons:commons-lang3") implementation("org.apache.commons:commons-math3") + implementation("org.apache.commons:commons-text") implementation("commons-io:commons-io") implementation("org.codehaus.janino:commons-compiler") implementation("org.codehaus.janino:janino") diff --git a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java index 43242677b6..ec3e881262 100644 --- a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java +++ b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java @@ -190,6 +190,7 @@ import static org.apache.calcite.sql.fun.SqlLibraryOperators.JSON_SET; import static org.apache.calcite.sql.fun.SqlLibraryOperators.JSON_STORAGE_SIZE; import static org.apache.calcite.sql.fun.SqlLibraryOperators.JSON_TYPE; import static org.apache.calcite.sql.fun.SqlLibraryOperators.LEFT; +import static org.apache.calcite.sql.fun.SqlLibraryOperators.LEVENSHTEIN; import static org.apache.calcite.sql.fun.SqlLibraryOperators.LOG; import static org.apache.calcite.sql.fun.SqlLibraryOperators.LOGICAL_AND; import static org.apache.calcite.sql.fun.SqlLibraryOperators.LOGICAL_OR; @@ -542,6 +543,7 @@ public class RexImpTable { defineMethod(SOUNDEX, BuiltInMethod.SOUNDEX.method, NullPolicy.STRICT); defineMethod(DIFFERENCE, BuiltInMethod.DIFFERENCE.method, NullPolicy.STRICT); defineMethod(REVERSE, BuiltInMethod.REVERSE.method, NullPolicy.STRICT); + defineMethod(LEVENSHTEIN, BuiltInMethod.LEVENSHTEIN.method, NullPolicy.STRICT); defineMethod(SPLIT, "split", NullPolicy.STRICT); map.put(TRIM, new TrimImplementor()); diff --git a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java index 69834c84f6..4c0d257f1f 100644 --- a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java +++ b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java @@ -50,6 +50,7 @@ import org.apache.commons.codec.binary.Base32; import org.apache.commons.codec.binary.Hex; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.language.Soundex; +import org.apache.commons.text.similarity.LevenshteinDistance; import com.google.common.base.Splitter; import com.google.common.base.Strings; @@ -143,6 +144,9 @@ public class SqlFunctions { private static final int SOUNDEX_LENGTH = 4; + private static final LevenshteinDistance LEVENSHTEIN_DISTANCE = + LevenshteinDistance.getDefaultInstance(); + private static final Pattern FROM_BASE64_REGEXP = Pattern.compile("[\\t\\n\\r\\s]"); private static final Base32 BASE_32 = new Base32(); @@ -688,6 +692,11 @@ public class SqlFunctions { return buf.reverse().toString(); } + /** SQL LEVENSHTEIN(string1, string2) function. */ + public static int levenshtein(String string1, String string2) { + return LEVENSHTEIN_DISTANCE.apply(string1, string2); + } + /** SQL ASCII(string) function. */ public static int ascii(String s) { return s.isEmpty() diff --git a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java index 18abd19975..520c34ac01 100644 --- a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java +++ b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java @@ -1267,6 +1267,14 @@ public abstract class SqlLibraryOperators { OperandTypes.CHARACTER) .withFunctionType(SqlFunctionCategory.STRING); + /** The "LEVENSHTEIN(string1, string2)" function. */ + @LibraryOperator(libraries = {HIVE, SPARK}) + public static final SqlFunction LEVENSHTEIN = + SqlBasicFunction.create("LEVENSHTEIN", + ReturnTypes.INTEGER_NULLABLE, + OperandTypes.STRING_STRING, + SqlFunctionCategory.STRING); + @LibraryOperator(libraries = {BIG_QUERY, MYSQL}) public static final SqlFunction FROM_BASE64 = SqlBasicFunction.create("FROM_BASE64", diff --git a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java index b02870f351..e7ec9124b9 100644 --- a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java +++ b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java @@ -360,6 +360,7 @@ public enum BuiltInMethod { STRCMP(SqlFunctions.class, "strcmp", String.class, String.class), DIFFERENCE(SqlFunctions.class, "difference", String.class, String.class), REVERSE(SqlFunctions.class, "reverse", String.class), + LEVENSHTEIN(SqlFunctions.class, "levenshtein", String.class, String.class), LEFT(SqlFunctions.class, "left", String.class, int.class), RIGHT(SqlFunctions.class, "right", String.class, int.class), TO_BASE64(SqlFunctions.class, "toBase64", String.class), diff --git a/gradle.properties b/gradle.properties index 6bf7cff53e..e99adc707c 100644 --- a/gradle.properties +++ b/gradle.properties @@ -97,6 +97,7 @@ commons-lang3.version=3.8 commons-math3.version=3.6.1 commons-pool2.version=2.6.2 commons-collections4.version=4.4 +commons-text.version=1.10.0 dropwizard-metrics.version=4.0.5 # do not upgrade this, new versions are Category X license. diff --git a/site/_docs/reference.md b/site/_docs/reference.md index 12f6467fc1..75060150df 100644 --- a/site/_docs/reference.md +++ b/site/_docs/reference.md @@ -2748,6 +2748,7 @@ BigQuery's type system uses confusingly different names for types and functions: | b o | LEAST(expr [, expr ]* ) | Returns the least of the expressions | b m p | LEFT(string, length) | Returns the leftmost *length* characters from the *string* | b | LENGTH(string) | Equivalent to `CHAR_LENGTH(string)` +| h s | LEVENSHTEIN(string1, string2) | Returns the Levenshtein distance between *string1* and *string2* | b | LOG(numeric1 [, numeric2 ]) | Returns the logarithm of *numeric1* to base *numeric2*, or base e if *numeric2* is not present | b o | LPAD(string, length [, pattern ]) | Returns a string or bytes value that consists of *string* prepended to *length* with *pattern* | b | TO_BASE32(string) | Converts the *string* to base-32 encoded form and returns an encoded string diff --git a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java index 7771cf1948..e2fa1ab988 100644 --- a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java +++ b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java @@ -4320,6 +4320,25 @@ public class SqlOperatorTest { f0.forEachLibrary(list(SqlLibrary.BIG_QUERY, SqlLibrary.MYSQL), consumer); } + @Test void testLevenshtein() { + final SqlOperatorFixture f0 = fixture().setFor(SqlLibraryOperators.LEVENSHTEIN); + f0.checkFails("^levenshtein('abc', 'abc')^", + "No match found for function signature LEVENSHTEIN\\(<CHARACTER>, <CHARACTER>\\)", + false); + final Consumer<SqlOperatorFixture> consumer = f -> { + f.checkScalar("levenshtein('', '')", 0, "INTEGER NOT NULL"); + f.checkScalar("levenshtein('abc', 'abc')", 0, "INTEGER NOT NULL"); + f.checkScalar("levenshtein('kitten', 'sitting')", 3, "INTEGER NOT NULL"); + f.checkScalar("levenshtein('frog', 'fog')", 1, "INTEGER NOT NULL"); + f.checkScalar("levenshtein(_UTF8'\u4F60\u597D', _UTF8'\u4F60\u5F88\u597D')", + 1, "INTEGER NOT NULL"); + f.checkNull("levenshtein(cast(null as varchar), 'abc')"); + f.checkNull("levenshtein('abc', cast(null as varchar))"); + f.checkNull("levenshtein(cast(null as varchar), cast(null as varchar))"); + }; + f0.forEachLibrary(list(SqlLibrary.HIVE, SqlLibrary.SPARK), consumer); + } + @Test void testIfFunc() { final SqlOperatorFixture f = fixture(); checkIf(f.withLibrary(SqlLibrary.BIG_QUERY));