This is an automated email from the ASF dual-hosted git repository.

xiong pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/calcite.git


The following commit(s) were added to refs/heads/main by this push:
     new 132cc3df5c [CALCITE-5851] Add LEVENSHTEIN function (enabled in Hive and Spark library)
132cc3df5c is described below

commit 132cc3df5c3248840e175bc6687718304177a91d
Author: Runkang He <hrun...@gmail.com>
AuthorDate: Thu Jul 20 17:54:05 2023 +0800

    [CALCITE-5851] Add LEVENSHTEIN function (enabled in Hive and Spark library)
---
 bom/build.gradle.kts                                  |  1 +
 core/build.gradle.kts                                 |  1 +
 .../calcite/adapter/enumerable/RexImpTable.java       |  2 ++
 .../java/org/apache/calcite/runtime/SqlFunctions.java |  9 +++++++++
 .../apache/calcite/sql/fun/SqlLibraryOperators.java   |  8 ++++++++
 .../java/org/apache/calcite/util/BuiltInMethod.java   |  1 +
 gradle.properties                                     |  1 +
 site/_docs/reference.md                               |  1 +
 .../java/org/apache/calcite/test/SqlOperatorTest.java | 19 +++++++++++++++++++
 9 files changed, 43 insertions(+)

diff --git a/bom/build.gradle.kts b/bom/build.gradle.kts
index 10ac5b06ec..e4b3a82aa2 100644
--- a/bom/build.gradle.kts
+++ b/bom/build.gradle.kts
@@ -101,6 +101,7 @@ dependencies {
         apiv("org.apache.commons:commons-math3")
         apiv("org.apache.commons:commons-pool2")
         apiv("org.apache.commons:commons-collections4")
+        apiv("org.apache.commons:commons-text")
         apiv("org.apache.geode:geode-core")
         apiv("org.apache.hadoop:hadoop-client", "hadoop")
         apiv("org.apache.hadoop:hadoop-common", "hadoop")
diff --git a/core/build.gradle.kts b/core/build.gradle.kts
index a447f25c48..1aa45832c1 100644
--- a/core/build.gradle.kts
+++ b/core/build.gradle.kts
@@ -71,6 +71,7 @@ dependencies {
     implementation("org.apache.commons:commons-dbcp2")
     implementation("org.apache.commons:commons-lang3")
     implementation("org.apache.commons:commons-math3")
+    implementation("org.apache.commons:commons-text")
     implementation("commons-io:commons-io")
     implementation("org.codehaus.janino:commons-compiler")
     implementation("org.codehaus.janino:janino")
diff --git a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
index 43242677b6..ec3e881262 100644
--- a/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
+++ b/core/src/main/java/org/apache/calcite/adapter/enumerable/RexImpTable.java
@@ -190,6 +190,7 @@ import static org.apache.calcite.sql.fun.SqlLibraryOperators.JSON_SET;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.JSON_STORAGE_SIZE;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.JSON_TYPE;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.LEFT;
+import static org.apache.calcite.sql.fun.SqlLibraryOperators.LEVENSHTEIN;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.LOG;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.LOGICAL_AND;
 import static org.apache.calcite.sql.fun.SqlLibraryOperators.LOGICAL_OR;
@@ -542,6 +543,7 @@ public class RexImpTable {
       defineMethod(SOUNDEX, BuiltInMethod.SOUNDEX.method, NullPolicy.STRICT);
       defineMethod(DIFFERENCE, BuiltInMethod.DIFFERENCE.method, NullPolicy.STRICT);
       defineMethod(REVERSE, BuiltInMethod.REVERSE.method, NullPolicy.STRICT);
+      defineMethod(LEVENSHTEIN, BuiltInMethod.LEVENSHTEIN.method, NullPolicy.STRICT);
       defineMethod(SPLIT, "split", NullPolicy.STRICT);
 
       map.put(TRIM, new TrimImplementor());
diff --git a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
index 69834c84f6..4c0d257f1f 100644
--- a/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
+++ b/core/src/main/java/org/apache/calcite/runtime/SqlFunctions.java
@@ -50,6 +50,7 @@ import org.apache.commons.codec.binary.Base32;
 import org.apache.commons.codec.binary.Hex;
 import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.codec.language.Soundex;
+import org.apache.commons.text.similarity.LevenshteinDistance;
 
 import com.google.common.base.Splitter;
 import com.google.common.base.Strings;
@@ -143,6 +144,9 @@ public class SqlFunctions {
 
   private static final int SOUNDEX_LENGTH = 4;
 
+  private static final LevenshteinDistance LEVENSHTEIN_DISTANCE =
+      LevenshteinDistance.getDefaultInstance();
+
   private static final Pattern FROM_BASE64_REGEXP = Pattern.compile("[\\t\\n\\r\\s]");
 
   private static final Base32 BASE_32 = new Base32();
@@ -688,6 +692,11 @@ public class SqlFunctions {
     return buf.reverse().toString();
   }
 
+  /** SQL LEVENSHTEIN(string1, string2) function. */
+  public static int levenshtein(String string1, String string2) {
+    return LEVENSHTEIN_DISTANCE.apply(string1, string2);
+  }
+
   /** SQL ASCII(string) function. */
   public static int ascii(String s) {
     return s.isEmpty()
diff --git a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
index 18abd19975..520c34ac01 100644
--- a/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
+++ b/core/src/main/java/org/apache/calcite/sql/fun/SqlLibraryOperators.java
@@ -1267,6 +1267,14 @@ public abstract class SqlLibraryOperators {
           OperandTypes.CHARACTER)
           .withFunctionType(SqlFunctionCategory.STRING);
 
+  /** The "LEVENSHTEIN(string1, string2)" function. */
+  @LibraryOperator(libraries = {HIVE, SPARK})
+  public static final SqlFunction LEVENSHTEIN =
+      SqlBasicFunction.create("LEVENSHTEIN",
+          ReturnTypes.INTEGER_NULLABLE,
+          OperandTypes.STRING_STRING,
+          SqlFunctionCategory.STRING);
+
   @LibraryOperator(libraries = {BIG_QUERY, MYSQL})
   public static final SqlFunction FROM_BASE64 =
       SqlBasicFunction.create("FROM_BASE64",
diff --git a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
index b02870f351..e7ec9124b9 100644
--- a/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
+++ b/core/src/main/java/org/apache/calcite/util/BuiltInMethod.java
@@ -360,6 +360,7 @@ public enum BuiltInMethod {
   STRCMP(SqlFunctions.class, "strcmp", String.class, String.class),
   DIFFERENCE(SqlFunctions.class, "difference", String.class, String.class),
   REVERSE(SqlFunctions.class, "reverse", String.class),
+  LEVENSHTEIN(SqlFunctions.class, "levenshtein", String.class, String.class),
   LEFT(SqlFunctions.class, "left", String.class, int.class),
   RIGHT(SqlFunctions.class, "right", String.class, int.class),
   TO_BASE64(SqlFunctions.class, "toBase64", String.class),
diff --git a/gradle.properties b/gradle.properties
index 6bf7cff53e..e99adc707c 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -97,6 +97,7 @@ commons-lang3.version=3.8
 commons-math3.version=3.6.1
 commons-pool2.version=2.6.2
 commons-collections4.version=4.4
+commons-text.version=1.10.0
 dropwizard-metrics.version=4.0.5
 
 # do not upgrade this, new versions are Category X license.
diff --git a/site/_docs/reference.md b/site/_docs/reference.md
index 12f6467fc1..75060150df 100644
--- a/site/_docs/reference.md
+++ b/site/_docs/reference.md
@@ -2748,6 +2748,7 @@ BigQuery's type system uses confusingly different names for types and functions:
 | b o | LEAST(expr [, expr ]* )                      | Returns the least of the expressions
 | b m p | LEFT(string, length)                       | Returns the leftmost *length* characters from the *string*
 | b | LENGTH(string)                                 | Equivalent to `CHAR_LENGTH(string)`
+| h s | LEVENSHTEIN(string1, string2)                | Returns the Levenshtein distance between *string1* and *string2*
 | b | LOG(numeric1 [, numeric2 ])                    | Returns the logarithm of *numeric1* to base *numeric2*, or base e if *numeric2* is not present
 | b o | LPAD(string, length [, pattern ])            | Returns a string or bytes value that consists of *string* prepended to *length* with *pattern*
 | b | TO_BASE32(string)                              | Converts the *string* to base-32 encoded form and returns an encoded string
diff --git a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
index 7771cf1948..e2fa1ab988 100644
--- a/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
+++ b/testkit/src/main/java/org/apache/calcite/test/SqlOperatorTest.java
@@ -4320,6 +4320,25 @@ public class SqlOperatorTest {
     f0.forEachLibrary(list(SqlLibrary.BIG_QUERY, SqlLibrary.MYSQL), consumer);
   }
 
+  @Test void testLevenshtein() {
+    final SqlOperatorFixture f0 = fixture().setFor(SqlLibraryOperators.LEVENSHTEIN);
+    f0.checkFails("^levenshtein('abc', 'abc')^",
+        "No match found for function signature LEVENSHTEIN\\(<CHARACTER>, <CHARACTER>\\)",
+        false);
+    final Consumer<SqlOperatorFixture> consumer = f -> {
+      f.checkScalar("levenshtein('', '')", 0, "INTEGER NOT NULL");
+      f.checkScalar("levenshtein('abc', 'abc')", 0, "INTEGER NOT NULL");
+      f.checkScalar("levenshtein('kitten', 'sitting')", 3, "INTEGER NOT NULL");
+      f.checkScalar("levenshtein('frog', 'fog')", 1, "INTEGER NOT NULL");
+      f.checkScalar("levenshtein(_UTF8'\u4F60\u597D', _UTF8'\u4F60\u5F88\u597D')",
+          1, "INTEGER NOT NULL");
+      f.checkNull("levenshtein(cast(null as varchar), 'abc')");
+      f.checkNull("levenshtein('abc', cast(null as varchar))");
+      f.checkNull("levenshtein(cast(null as varchar), cast(null as varchar))");
+    };
+    f0.forEachLibrary(list(SqlLibrary.HIVE, SqlLibrary.SPARK), consumer);
+  }
+
   @Test void testIfFunc() {
     final SqlOperatorFixture f = fixture();
     checkIf(f.withLibrary(SqlLibrary.BIG_QUERY));

Reply via email to