This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new d557a56b956 [SPARK-39107][SQL] Account for empty string input in regex 
replace
d557a56b956 is described below

commit d557a56b956545806019ee7f13f41955d8cb107f
Author: Lorenzo Martini <lmart...@palantir.com>
AuthorDate: Mon May 9 19:44:19 2022 -0500

    [SPARK-39107][SQL] Account for empty string input in regex replace
    
    ### What changes were proposed in this pull request?
    
    When trying to perform a regex replace, account for the possibility of 
having empty strings as input.
    
    ### Why are the changes needed?
    
    https://github.com/apache/spark/pull/29891 was merged to address 
https://issues.apache.org/jira/browse/SPARK-30796 and introduced a bug that 
would not allow regex matching on empty strings, as it would account for 
position within substring but not consider the case where input string has 
length 0 (empty string)
    
    From https://issues.apache.org/jira/browse/SPARK-39107 there is a change in 
behavior between spark versions.
    3.0.2
    ```
    scala> val df = spark.sql("SELECT '' AS col")
    df: org.apache.spark.sql.DataFrame = [col: string]
    
    scala> df.withColumn("replaced", regexp_replace(col("col"), "^$", 
"<empty>")).show
    +---+--------+
    |col|replaced|
    +---+--------+
    |   | <empty>|
    +---+--------+
    ```
    3.1.2
    ```
    scala> val df = spark.sql("SELECT '' AS col")
    df: org.apache.spark.sql.DataFrame = [col: string]
    
    scala> df.withColumn("replaced", regexp_replace(col("col"), "^$", 
"<empty>")).show
    +---+--------+
    |col|replaced|
    +---+--------+
    |   |        |
    +---+--------+
    ```
    
    The 3.0.2 outcome is the expected and correct one
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes compared to spark 3.2.1, as it brings back the correct behavior when 
trying to regex match empty strings, as shown in the example above.
    
    ### How was this patch tested?
    
    Added special casing test in `RegexpExpressionsSuite.RegexReplace` with 
empty string replacement.
    
    Closes #36457 from LorenzoMartini/lmartini/fix-empty-string-replace.
    
    Authored-by: Lorenzo Martini <lmart...@palantir.com>
    Signed-off-by: Sean Owen <sro...@gmail.com>
    (cherry picked from commit 731aa2cdf8a78835621fbf3de2d3492b27711d1a)
    Signed-off-by: Sean Owen <sro...@gmail.com>
---
 .../org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala | 4 ++--
 .../spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala       | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index 6c0de3fc7ef..27013e26bdd 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -537,7 +537,7 @@ case class RegExpReplace(subject: Expression, regexp: 
Expression, rep: Expressio
     }
     val source = s.toString()
     val position = i.asInstanceOf[Int] - 1
-    if (position < source.length) {
+    if (position == 0 || position < source.length) {
       val m = pattern.matcher(source)
       m.region(position, source.length)
       result.delete(0, result.length())
@@ -592,7 +592,7 @@ case class RegExpReplace(subject: Expression, regexp: 
Expression, rep: Expressio
       }
       String $source = $subject.toString();
       int $position = $pos - 1;
-      if ($position < $source.length()) {
+      if ($position == 0 || $position < $source.length()) {
         $classNameStringBuffer $termResult = new $classNameStringBuffer();
         java.util.regex.Matcher $matcher = $termPattern.matcher($source);
         $matcher.region($position, $source.length());
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
index 019857580d0..2ca9ede7742 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala
@@ -293,6 +293,7 @@ class RegexpExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper {
     val row4 = create_row(null, "(\\d+)", "###")
     val row5 = create_row("100-200", null, "###")
     val row6 = create_row("100-200", "(-)", null)
+    val row7 = create_row("", "^$", "<empty string>")
 
     val s = 's.string.at(0)
     val p = 'p.string.at(1)
@@ -305,6 +306,7 @@ class RegexpExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper {
     checkEvaluation(expr, null, row4)
     checkEvaluation(expr, null, row5)
     checkEvaluation(expr, null, row6)
+    checkEvaluation(expr, "<empty string>", row7)
     // test position
     val exprWithPos = RegExpReplace(s, p, r, 4)
     checkEvaluation(exprWithPos, "100-num", row1)
@@ -313,6 +315,7 @@ class RegexpExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper {
     checkEvaluation(exprWithPos, null, row4)
     checkEvaluation(exprWithPos, null, row5)
     checkEvaluation(exprWithPos, null, row6)
+    checkEvaluation(exprWithPos, "", row7)
     val exprWithLargePos = RegExpReplace(s, p, r, 7)
     checkEvaluation(exprWithLargePos, "100-20num", row1)
     checkEvaluation(exprWithLargePos, "100-20###", row2)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to