This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 67b6f0ef85d [SPARK-42335][SQL] Pass the comment option through to univocity if users set it explicitly in CSV dataSource
67b6f0ef85d is described below

commit 67b6f0ef85d75d33bd460ab76d5137d254684bb7
Author: wayneguow <guo...@gmail.com>
AuthorDate: Wed Feb 8 13:12:47 2023 -0800

    [SPARK-42335][SQL] Pass the comment option through to univocity if users set it explicitly in CSV dataSource
    
    ### What changes were proposed in this pull request?
    Pass the comment option through to univocity if users set it explicitly in CSV dataSource.
    
    ### Why are the changes needed?
    In #29516, univocity-parsers was upgraded from 2.8.3 to 2.9.0 in order to fix some bugs. The upgrade also brought in a new univocity-parsers feature that quotes values in the first column when they start with the comment character. This was a breaking change for downstream users who handle a whole row as input.
    Before this change:
    #abc,1
    After this change:
    "#abc",1
    We change the related `isCommentSet` check logic so that users can keep the previous behavior.
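
    A minimal write-side reproduction sketch of the behavior described above (not part of this patch; the local SparkSession setup and the /tmp output paths are illustrative assumptions, and the expected file contents mirror the new test added below):

    import org.apache.spark.sql.SparkSession

    val spark = SparkSession.builder().master("local[1]").appName("SPARK-42335-demo").getOrCreate()
    import spark.implicits._

    // Since univocity-parsers 2.9.0, a first-column value that starts with the comment
    // character ('#' unless overridden) is quoted when the CSV file is written.
    Seq("#abc", "xyz").toDF()
      .write.csv("/tmp/spark-42335-default")                                   // file contains: "#abc" and xyz

    // With this patch, an explicit '\u0000' comment option is passed through to univocity,
    // so '#' is no longer treated as a comment character and the value is written
    // unquoted, as it was before the upgrade.
    Seq("#abc", "xyz").toDF()
      .write.option("comment", "\u0000").csv("/tmp/spark-42335-null-comment")  // file contains: #abc and xyz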
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, a little. Users who previously set the comment option to '\u0000' explicitly should now remove it to keep the comment option unset.
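
    A hedged read-side sketch of the difference (reusing the spark session from the sketch above; the input path is an assumed placeholder, and the expected results mirror the new test added below):

    // Assume a text file at this path with the three lines: #abc, \u0000def, xyz.
    val inputPath = "/tmp/spark-42335-input"

    // After this change, an explicit '\u0000' comment option is honored: lines starting
    // with '\u0000' are skipped as comments, so only #abc and xyz are returned.
    spark.read.option("comment", "\u0000").csv(inputPath).show()

    // Dropping the option keeps comment processing unset, so all three lines come back.
    spark.read.csv(inputPath).show()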
    
    ### How was this patch tested?
    Add a new test covering both the write and read paths.
    
    Closes #39878 from wayneguow/comment.
    
    Authored-by: wayneguow <guo...@gmail.com>
    Signed-off-by: Sean Owen <sro...@gmail.com>
---
 .../apache/spark/sql/catalyst/csv/CSVOptions.scala |  5 ++-
 .../sql/execution/datasources/csv/CSVSuite.scala   | 47 ++++++++++++++++++++++
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
index a66070aa853..81fcffec586 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
@@ -222,7 +222,10 @@ class CSVOptions(
    */
   val maxErrorContentLength = 1000
 
-  val isCommentSet = this.comment != '\u0000'
+  val isCommentSet = parameters.get(COMMENT) match {
+    case Some(value) if value.length == 1 => true
+    case _ => false
+  }
 
   val samplingRatio =
     parameters.get(SAMPLING_RATIO).map(_.toDouble).getOrElse(1.0)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 3fe91b12e15..44f1b2faceb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -3101,6 +3101,53 @@ abstract class CSVSuite
     }
   }
 
+  test("SPARK-42335: Pass the comment option through to univocity " +
+    "if users set it explicitly in CSV dataSource") {
+    withTempPath { path =>
+      Seq("#abc", "\u0000def", "xyz").toDF()
+        .write.option("comment", "\u0000").csv(path.getCanonicalPath)
+      checkAnswer(
+        spark.read.text(path.getCanonicalPath),
+        Seq(Row("#abc"), Row("\"def\""), Row("xyz"))
+      )
+    }
+    withTempPath { path =>
+      Seq("#abc", "\u0000def", "xyz").toDF()
+        .write.option("comment", "#").csv(path.getCanonicalPath)
+      checkAnswer(
+        spark.read.text(path.getCanonicalPath),
+        Seq(Row("\"#abc\""), Row("def"), Row("xyz"))
+      )
+    }
+    withTempPath { path =>
+      Seq("#abc", "\u0000def", "xyz").toDF()
+        .write.csv(path.getCanonicalPath)
+      checkAnswer(
+        spark.read.text(path.getCanonicalPath),
+        Seq(Row("\"#abc\""), Row("def"), Row("xyz"))
+      )
+    }
+    withTempPath { path =>
+      Seq("#abc", "\u0000def", "xyz").toDF().write.text(path.getCanonicalPath)
+      checkAnswer(
+        spark.read.option("comment", "\u0000").csv(path.getCanonicalPath),
+        Seq(Row("#abc"), Row("xyz")))
+    }
+    withTempPath { path =>
+      Seq("#abc", "\u0000def", "xyz").toDF().write.text(path.getCanonicalPath)
+      checkAnswer(
+        spark.read.option("comment", "#").csv(path.getCanonicalPath),
+        Seq(Row("\u0000def"), Row("xyz")))
+    }
+    withTempPath { path =>
+      Seq("#abc", "\u0000def", "xyz").toDF().write.text(path.getCanonicalPath)
+      checkAnswer(
+        spark.read.csv(path.getCanonicalPath),
+        Seq(Row("#abc"), Row("\u0000def"), Row("xyz"))
+      )
+    }
+  }
+
   test("SPARK-40667: validate CSV Options") {
     assert(CSVOptions.getAllOptions.size == 38)
     // Please add validation on any new CSV options here


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
