This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 5adcddb87a0 [SPARK-39695][SQL] Add the `REGEXP_SUBSTR` function 5adcddb87a0 is described below commit 5adcddb87a052ce8e3b3c917c61f019bea5532ae Author: Max Gekk <max.g...@gmail.com> AuthorDate: Thu Jul 7 11:22:41 2022 +0300 [SPARK-39695][SQL] Add the `REGEXP_SUBSTR` function ### What changes were proposed in this pull request? In the PR, I propose to add a new expression `RegExpSubStr` as a runtime replaceable expression of `NullIf` and `RegExpExtract`. And bind the expression to the function name `REGEXP_SUBSTR`. The `REGEXP_SUBSTR` function returns the substring that matches a regular expression within a string. It takes two parameters: 1. An expression that specifies the string in which the search is to take place. 2. An expression that specifies the regular expression string that is the pattern for the search. If the regular expression is not found, the result is **null** (this behaviour is similar to other DBMSs). When any of the input parameters are NULL, the function returns NULL too. For example: ```sql spark-sql> CREATE TABLE log (logs string); spark-sql> INSERT INTO log (logs) VALUES > ('127.0.0.1 - - [10/Jan/2022:16:55:36 -0800] "GET / HTTP/1.0" 200 2217'), > ('192.168.1.99 - - [14/Feb/2022:10:27:10 -0800] "GET /cgi-bin/try/ HTTP/1.0" 200 3396'); spark-sql> SELECT REGEXP_SUBSTR (logs,'\\b\\d{1,3}\.\\d{1,3}\.\\d{1,3}\.\\d{1,3}\\b') AS IP, REGEXP_SUBSTR (logs,'([\\w:\/]+\\s[+\-]\\d{4})') AS DATE FROM log; 127.0.0.1 10/Jan/2022:16:55:36 -0800 192.168.1.99 14/Feb/2022:10:27:10 -0800 ``` ### Why are the changes needed? To make the migration process from other systems to Spark SQL easier, and achieve feature parity with such systems. 
For example, the systems below support the `REGEXP_SUBSTR` function, see: - Oracle: https://docs.oracle.com/cd/B12037_01/server.101/b10759/functions116.htm - DB2: https://www.ibm.com/docs/en/db2/11.5?topic=functions-regexp-substr - Snowflake: https://docs.snowflake.com/en/sql-reference/functions/regexp_substr.html - BigQuery: https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#regexp_substr - Redshift: https://docs.aws.amazon.com/redshift/latest/dg/REGEXP_SUBSTR.html - MariaDB: https://mariadb.com/kb/en/regexp_substr/ - Exasol DB: https://docs.exasol.com/db/latest/sql_references/functions/alphabeticallistfunctions/regexp_substr.htm ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? By running new tests: ``` $ build/sbt "sql/testOnly org.apache.spark.sql.SQLQueryTestSuite -- -z regexp-functions.sql" $ build/sbt "sql/testOnly *ExpressionsSchemaSuite" $ build/sbt "sql/test:testOnly org.apache.spark.sql.expressions.ExpressionInfoSuite" ``` Closes #37101 from MaxGekk/regexp_substr. 
Authored-by: Max Gekk <max.g...@gmail.com> Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../sql/catalyst/analysis/FunctionRegistry.scala | 1 + .../catalyst/expressions/regexpExpressions.scala | 39 +++++++++++++++ .../sql-functions/sql-expression-schema.md | 1 + .../sql-tests/inputs/regexp-functions.sql | 9 ++++ .../sql-tests/results/regexp-functions.sql.out | 56 ++++++++++++++++++++++ 5 files changed, 106 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 52d84cfa175..20c719aec68 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -585,6 +585,7 @@ object FunctionRegistry { expression[XPathShort]("xpath_short"), expression[XPathString]("xpath_string"), expression[RegExpCount]("regexp_count"), + expression[RegExpSubStr]("regexp_substr"), // datetime functions expression[AddMonths]("add_months"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 8d813058296..b240e849f4d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -1004,3 +1004,42 @@ case class RegExpCount(left: Expression, right: Expression) newChildren: IndexedSeq[Expression]): RegExpCount = copy(left = newChildren(0), right = newChildren(1)) } + +// scalastyle:off line.size.limit +@ExpressionDescription( + usage = """ + _FUNC_(str, regexp) - Returns the substring that matches the regular expression `regexp` within the string `str`. 
If the regular expression is not found, the result is null. + """, + arguments = """ + Arguments: + * str - a string expression. + * regexp - a string representing a regular expression. The regex string should be a Java regular expression. + """, + examples = """ + Examples: + > SELECT _FUNC_('Steven Jones and Stephen Smith are the best players', 'Ste(v|ph)en'); + Steven + > SELECT _FUNC_('Steven Jones and Stephen Smith are the best players', 'Jeck'); + NULL + """, + since = "3.4.0", + group = "string_funcs") +// scalastyle:on line.size.limit +case class RegExpSubStr(left: Expression, right: Expression) + extends RuntimeReplaceable with ImplicitCastInputTypes { + + override lazy val replacement: Expression = + new NullIf( + RegExpExtract(subject = left, regexp = right, idx = Literal(0)), + Literal("")) + + override def prettyName: String = "regexp_substr" + + override def children: Seq[Expression] = Seq(left, right) + + override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + + override protected def withNewChildrenInternal( + newChildren: IndexedSeq[Expression]): RegExpSubStr = + copy(left = newChildren(0), right = newChildren(1)) +} diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index e24ae5d0f7b..0305781a48d 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -241,6 +241,7 @@ | org.apache.spark.sql.catalyst.expressions.RegExpExtract | regexp_extract | SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1) | struct<regexp_extract(100-200, (\d+)-(\d+), 1):string> | | org.apache.spark.sql.catalyst.expressions.RegExpExtractAll | regexp_extract_all | SELECT regexp_extract_all('100-200, 300-400', '(\\d+)-(\\d+)', 1) | struct<regexp_extract_all(100-200, 300-400, (\d+)-(\d+), 1):array<string>> | | 
org.apache.spark.sql.catalyst.expressions.RegExpReplace | regexp_replace | SELECT regexp_replace('100-200', '(\\d+)', 'num') | struct<regexp_replace(100-200, (\d+), num, 1):string> | +| org.apache.spark.sql.catalyst.expressions.RegExpSubStr | regexp_substr | SELECT regexp_substr('Steven Jones and Stephen Smith are the best players', 'Ste(v|ph)en') | struct<regexp_substr(Steven Jones and Stephen Smith are the best players, Ste(v|ph)en):string> | | org.apache.spark.sql.catalyst.expressions.Remainder | % | SELECT 2 % 1.8 | struct<(2 % 1.8):decimal(2,1)> | | org.apache.spark.sql.catalyst.expressions.Remainder | mod | SELECT 2 % 1.8 | struct<(2 % 1.8):decimal(2,1)> | | org.apache.spark.sql.catalyst.expressions.Reverse | reverse | SELECT reverse('Spark SQL') | struct<reverse(Spark SQL):string> | diff --git a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql index a48cfe68489..d828d761b77 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql @@ -60,3 +60,12 @@ SELECT regexp_count('the fox', '(?i)FOX'); SELECT regexp_count('passwd7 plain A1234 a1234', '(?=[^ ]*[a-z])(?=[^ ]*[0-9])[^ ]+'); SELECT regexp_count(null, 'abc'); SELECT regexp_count('abc', null); + +-- regexp_substr +SELECT regexp_substr('1a 2b 14m', '\\d+'); +SELECT regexp_substr('1a 2b 14m', '\\d+ '); +SELECT regexp_substr('1a 2b 14m', '\\d+(a|b|m)'); +SELECT regexp_substr('1a 2b 14m', '\\d{2}(a|b|m)'); +SELECT regexp_substr('1a 2b 14m', ''); +SELECT regexp_substr('Spark', null); +SELECT regexp_substr(null, '.*'); diff --git a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out index a2eb2b2a148..c82b892f481 100644 --- a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out +++ 
b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out @@ -435,3 +435,59 @@ SELECT regexp_count('abc', null) struct<regexp_count(abc, NULL):int> -- !query output NULL + + +-- !query +SELECT regexp_substr('1a 2b 14m', '\\d+') +-- !query schema +struct<regexp_substr(1a 2b 14m, \d+):string> +-- !query output +1 + + +-- !query +SELECT regexp_substr('1a 2b 14m', '\\d+ ') +-- !query schema +struct<regexp_substr(1a 2b 14m, \d+ ):string> +-- !query output +NULL + + +-- !query +SELECT regexp_substr('1a 2b 14m', '\\d+(a|b|m)') +-- !query schema +struct<regexp_substr(1a 2b 14m, \d+(a|b|m)):string> +-- !query output +1a + + +-- !query +SELECT regexp_substr('1a 2b 14m', '\\d{2}(a|b|m)') +-- !query schema +struct<regexp_substr(1a 2b 14m, \d{2}(a|b|m)):string> +-- !query output +14m + + +-- !query +SELECT regexp_substr('1a 2b 14m', '') +-- !query schema +struct<regexp_substr(1a 2b 14m, ):string> +-- !query output +NULL + + +-- !query +SELECT regexp_substr('Spark', null) +-- !query schema +struct<regexp_substr(Spark, NULL):string> +-- !query output +NULL + + +-- !query +SELECT regexp_substr(null, '.*') +-- !query schema +struct<regexp_substr(NULL, .*):string> +-- !query output +NULL --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org