This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-2.4 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-2.4 by this push: new c80b79f [SPARK-30763][SQL][2.4] Fix java.lang.IndexOutOfBoundsException No group 1 for regexp_extract c80b79f is described below commit c80b79f2d4f13d1db1414f7ce28d7d054105c498 Author: beliefer <belie...@163.com> AuthorDate: Wed Feb 19 20:34:53 2020 +0800 [SPARK-30763][SQL][2.4] Fix java.lang.IndexOutOfBoundsException No group 1 for regexp_extract ### What changes were proposed in this pull request? This PR follows https://github.com/apache/spark/pull/27508 and used to spark2.4. ### Why are the changes needed? Fix a bug `java.lang.IndexOutOfBoundsException No group 1` ### Does this PR introduce any user-facing change? Yes ### How was this patch tested? New UT. Closes #27631 from beliefer/fix-2.4-regexp_extract-bug. Authored-by: beliefer <belie...@163.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../catalyst/expressions/regexpExpressions.scala | 15 ++++- .../expressions/RegexpExpressionsSuite.scala | 12 ++++ .../sql-tests/inputs/regexp-functions.sql | 9 +++ .../sql-tests/results/regexp-functions.sql.out | 69 ++++++++++++++++++++++ 4 files changed, 104 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index e80543c..7086e4d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -366,6 +366,15 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio } } +object RegExpExtract { + def checkGroupIndex(groupCount: Int, groupIndex: Int): Unit = { + if (groupCount < groupIndex) { + throw new IllegalArgumentException( + s"Regex group count is $groupCount, but the specified group index is $groupIndex") + } + } +} + /** * Extract a specific(idx) group identified by a Java regex. * @@ -397,7 +406,9 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio val m = pattern.matcher(s.toString) if (m.find) { val mr: MatchResult = m.toMatchResult - val group = mr.group(r.asInstanceOf[Int]) + val index = r.asInstanceOf[Int] + RegExpExtract.checkGroupIndex(mr.groupCount, index) + val group = mr.group(index) if (group == null) { // Pattern matched, but not optional group UTF8String.EMPTY_UTF8 } else { @@ -415,6 +426,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val classNamePattern = classOf[Pattern].getCanonicalName + val classNameRegExpExtract = classOf[RegExpExtract].getCanonicalName val matcher = ctx.freshName("matcher") val matchResult = ctx.freshName("matchResult") @@ -438,6 +450,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio $termPattern.matcher($subject.toString()); if ($matcher.find()) { java.util.regex.MatchResult $matchResult = $matcher.toMatchResult(); + $classNameRegExpExtract.checkGroupIndex($matchResult.groupCount(), $idx); if ($matchResult.group($idx) == null) { ${ev.value} = UTF8String.EMPTY_UTF8; } else { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index d532dc4..4c7a037 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -215,6 +215,18 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val nonNullExpr = RegExpExtract(Literal("100-200"), Literal("(\\d+)-(\\d+)"), Literal(1)) checkEvaluation(nonNullExpr, "100", row1) + + // invalid group index + val row8 = create_row("100-200", "(\\d+)-(\\d+)", 3) + val row9 = create_row("100-200", "(\\d+).*", 2) + val row10 = create_row("100-200", "\\d+", 1) + + checkExceptionInExpression[IllegalArgumentException]( + expr, row8, "Regex group count is 2, but the specified group index is 3") + checkExceptionInExpression[IllegalArgumentException]( + expr, row9, "Regex group count is 1, but the specified group index is 2") + checkExceptionInExpression[IllegalArgumentException]( + expr, row10, "Regex group count is 0, but the specified group index is 1") } test("SPLIT") { diff --git a/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql new file mode 100644 index 0000000..c0827a3 --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/inputs/regexp-functions.sql @@ -0,0 +1,9 @@ +-- regexp_extract +SELECT regexp_extract('1a 2b 14m', '\\d+'); +SELECT regexp_extract('1a 2b 14m', '\\d+', 0); +SELECT regexp_extract('1a 2b 14m', '\\d+', 1); +SELECT regexp_extract('1a 2b 14m', '\\d+', 2); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)'); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1); +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2); diff --git a/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out new file mode 100644 index 0000000..f54f67f --- /dev/null +++ b/sql/core/src/test/resources/sql-tests/results/regexp-functions.sql.out @@ -0,0 +1,69 @@ +-- Automatically generated by SQLQueryTestSuite +-- Number of queries: 8 + + +-- !query 0 +SELECT regexp_extract('1a 2b 14m', '\\d+') +-- !query 0 schema +struct<> +-- !query 0 output +java.lang.IllegalArgumentException +Regex group count is 0, but the specified group index is 1 + + +-- !query 1 +SELECT regexp_extract('1a 2b 14m', '\\d+', 0) +-- !query 1 schema +struct<regexp_extract(1a 2b 14m, \d+, 0):string> +-- !query 1 output +1 + + +-- !query 2 +SELECT regexp_extract('1a 2b 14m', '\\d+', 1) +-- !query 2 schema +struct<> +-- !query 2 output +java.lang.IllegalArgumentException +Regex group count is 0, but the specified group index is 1 + + +-- !query 3 +SELECT regexp_extract('1a 2b 14m', '\\d+', 2) +-- !query 3 schema +struct<> +-- !query 3 output +java.lang.IllegalArgumentException +Regex group count is 0, but the specified group index is 2 + + +-- !query 4 +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)') +-- !query 4 schema +struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 1):string> +-- !query 4 output +1 + + +-- !query 5 +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 0) +-- !query 5 schema +struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 0):string> +-- !query 5 output +1a + + +-- !query 6 +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 1) +-- !query 6 schema +struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 1):string> +-- !query 6 output +1 + + +-- !query 7 +SELECT regexp_extract('1a 2b 14m', '(\\d+)([a-z]+)', 2) +-- !query 7 schema +struct<regexp_extract(1a 2b 14m, (\d+)([a-z]+), 2):string> +-- !query 7 output +a --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org