This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 9666cf1 [SPARK-30763][SQL] Fix java.lang.IndexOutOfBoundsException No group 1 for regexp_extract 9666cf1 is described below commit 9666cf10aac873ad17e95074da5e1a78c8b64b47 Author: beliefer <belie...@163.com> AuthorDate: Wed Feb 12 14:49:22 2020 +0800 [SPARK-30763][SQL] Fix java.lang.IndexOutOfBoundsException No group 1 for regexp_extract ### What changes were proposed in this pull request? The current implementation of `regexp_extract` throws an unhandled exception, as shown below: `SELECT regexp_extract('1a 2b 14m', '\\d+')` ``` java.lang.IndexOutOfBoundsException: No group 1 [info] at java.util.regex.Matcher.group(Matcher.java:538) [info] at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source) [info] at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) [info] at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:729) ``` I think we should handle this exception properly. ### Why are the changes needed? Fix a bug: `java.lang.IndexOutOfBoundsException No group 1` ### Does this PR introduce any user-facing change? Yes ### How was this patch tested? New UT Closes #27508 from beliefer/fix-regexp_extract-bug. 
Authored-by: beliefer <belie...@163.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../catalyst/expressions/regexpExpressions.scala | 15 ++++- .../expressions/RegexpExpressionsSuite.scala | 12 ++++ .../sql-tests/inputs/regexp-functions.sql | 9 +++ .../sql-tests/results/regexp-functions.sql.out | 69 ++++++++++++++++++++++ 4 files changed, 104 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index ac620b1..7891c28 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -409,6 +409,15 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio } } +object RegExpExtract { + def checkGroupIndex(groupCount: Int, groupIndex: Int): Unit = { + if (groupCount < groupIndex) { + throw new IllegalArgumentException( + s"Regex group count is $groupCount, but the specified group index is $groupIndex") + } + } +} + /** * Extract a specific(idx) group identified by a Java regex. 
* @@ -440,7 +449,9 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio val m = pattern.matcher(s.toString) if (m.find) { val mr: MatchResult = m.toMatchResult - val group = mr.group(r.asInstanceOf[Int]) + val index = r.asInstanceOf[Int] + RegExpExtract.checkGroupIndex(mr.groupCount, index) + val group = mr.group(index) if (group == null) { // Pattern matched, but not optional group UTF8String.EMPTY_UTF8 } else { @@ -458,6 +469,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val classNamePattern = classOf[Pattern].getCanonicalName + val classNameRegExpExtract = classOf[RegExpExtract].getCanonicalName val matcher = ctx.freshName("matcher") val matchResult = ctx.freshName("matchResult") @@ -481,6 +493,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio $termPattern.matcher($subject.toString()); if ($matcher.find()) { java.util.regex.MatchResult $matchResult = $matcher.toMatchResult(); + $classNameRegExpExtract.checkGroupIndex($matchResult.groupCount(), $idx); if ($matchResult.group($idx) == null) { ${ev.value} = UTF8String.EMPTY_UTF8; } else { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 2c8794f..86da62b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -293,6 +293,18 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val nonNullExpr = RegExpExtract(Literal("100-200"), Literal("(\\d+)-(\\d+)"), Literal(1)) checkEvaluation(nonNullExpr, "100", row1) + + // invalid group index + val row8 = create_row("100-200", 
"(\\d+)-(\\d+)", 3) + val row9 = create_row("100-200", "(\\d+).*", 2) + val row10 = create_row("100-200", "\\d+", 1) + + checkExceptionInExpression[IllegalArgumentException]( + expr, row8, "Regex group count is 2, but the specified group index is 3") + checkExceptionInExpression[IllegalArgumentException]( + expr, row9, "Regex group count is 1, but the specified group index is 2") + checkExceptionInExpression[IllegalArgumentException]( + expr, row10, "Regex group count is 0, but the specified group index is 1") } test("SPLIT") { diff --git a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql new file mode 100644 index 0000000..c0827a3 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql @@ -0,0 +1,9 @@ +-- regexp_extract +SELECT regexp_extract('1a 2b 14m', '\\d+'); +SELECT regexp_extract('1a 2b 14m', '\\d+', 0); +SELECT regexp_extract('1a 2b 14m', '\\d+', 1); +SELECT regexp_extract('1a 2b 14m', '\\d+', 2); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)'); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2); diff --git a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out new file mode 100644 index 0000000..c92c1dd --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out @@ -0,0 +1,69 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 8 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+') +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Regex group count is 0, but the specified group index is 1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+', 0) +-- !query schema +struct<regexp_extract(1a 2b 14m, \d+, 0):string> +-- !query output +1 
+ + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+', 1) +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Regex group count is 0, but the specified group index is 1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '\\d+', 2) +-- !query schema +struct<> +-- !query output +java.lang.IllegalArgumentException +Regex group count is 0, but the specified group index is 2 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)') +-- !query schema +struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 1):string> +-- !query output +1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0) +-- !query schema +struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 0):string> +-- !query output +1a + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1) +-- !query schema +struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 1):string> +-- !query output +1 + + +-- !query +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2) +-- !query schema +struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 2):string> +-- !query output +a --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org