This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.1 by this push: new d557a56b956 [SPARK-39107][SQL] Account for empty string input in regex replace d557a56b956 is described below commit d557a56b956545806019ee7f13f41955d8cb107f Author: Lorenzo Martini <lmart...@palantir.com> AuthorDate: Mon May 9 19:44:19 2022 -0500 [SPARK-39107][SQL] Account for empty string input in regex replace ### What changes were proposed in this pull request? When trying to perform a regex replace, account for the possibility of having empty strings as input. ### Why are the changes needed? https://github.com/apache/spark/pull/29891 was merged to address https://issues.apache.org/jira/browse/SPARK-30796 and introduced a bug that would not allow regex matching on empty strings, as it would account for position within substring but not consider the case where input string has length 0 (empty string) From https://issues.apache.org/jira/browse/SPARK-39107 there is a change in behavior between spark versions. 3.0.2 ``` scala> val df = spark.sql("SELECT '' AS col") df: org.apache.spark.sql.DataFrame = [col: string] scala> df.withColumn("replaced", regexp_replace(col("col"), "^$", "<empty>")).show +---+--------+ |col|replaced| +---+--------+ | | <empty>| +---+--------+ ``` 3.1.2 ``` scala> val df = spark.sql("SELECT '' AS col") df: org.apache.spark.sql.DataFrame = [col: string] scala> df.withColumn("replaced", regexp_replace(col("col"), "^$", "<empty>")).show +---+--------+ |col|replaced| +---+--------+ | | | +---+--------+ ``` The 3.0.2 outcome is the expected and correct one ### Does this PR introduce _any_ user-facing change? Yes compared to spark 3.2.1, as it brings back the correct behavior when trying to regex match empty strings, as shown in the example above. ### How was this patch tested? Added special casing test in `RegexpExpressionsSuite.RegexReplace` with empty string replacement. Closes #36457 from LorenzoMartini/lmartini/fix-empty-string-replace. Authored-by: Lorenzo Martini <lmart...@palantir.com> Signed-off-by: Sean Owen <sro...@gmail.com> (cherry picked from commit 731aa2cdf8a78835621fbf3de2d3492b27711d1a) Signed-off-by: Sean Owen <sro...@gmail.com> --- .../org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala | 4 ++-- .../spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index 6c0de3fc7ef..27013e26bdd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -537,7 +537,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio } val source = s.toString() val position = i.asInstanceOf[Int] - 1 - if (position < source.length) { + if (position == 0 || position < source.length) { val m = pattern.matcher(source) m.region(position, source.length) result.delete(0, result.length()) @@ -592,7 +592,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio } String $source = $subject.toString(); int $position = $pos - 1; - if ($position < $source.length()) { + if ($position == 0 || $position < $source.length()) { $classNameStringBuffer $termResult = new $classNameStringBuffer(); java.util.regex.Matcher $matcher = $termPattern.matcher($source); $matcher.region($position, $source.length()); diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 019857580d0..2ca9ede7742 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -293,6 +293,7 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val row4 = create_row(null, "(\\d+)", "###") val row5 = create_row("100-200", null, "###") val row6 = create_row("100-200", "(-)", null) + val row7 = create_row("", "^$", "<empty string>") val s = 's.string.at(0) val p = 'p.string.at(1) @@ -305,6 +306,7 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(expr, null, row4) checkEvaluation(expr, null, row5) checkEvaluation(expr, null, row6) + checkEvaluation(expr, "<empty string>", row7) // test position val exprWithPos = RegExpReplace(s, p, r, 4) checkEvaluation(exprWithPos, "100-num", row1) @@ -313,6 +315,7 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation(exprWithPos, null, row4) checkEvaluation(exprWithPos, null, row5) checkEvaluation(exprWithPos, null, row6) + checkEvaluation(exprWithPos, "", row7) val exprWithLargePos = RegExpReplace(s, p, r, 7) checkEvaluation(exprWithLargePos, "100-20num", row1) checkEvaluation(exprWithLargePos, "100-20###", row2) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org