This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 67b6f0ef85d [SPARK-42335][SQL] Pass the comment option through to univocity if users set it explicitly in CSV dataSource 67b6f0ef85d is described below commit 67b6f0ef85d75d33bd460ab76d5137d254684bb7 Author: wayneguow <guo...@gmail.com> AuthorDate: Wed Feb 8 13:12:47 2023 -0800 [SPARK-42335][SQL] Pass the comment option through to univocity if users set it explicitly in CSV dataSource ### What changes were proposed in this pull request? Pass the comment option through to univocity if users set it explicitly in CSV dataSource. ### Why are the changes needed? In #29516 , in order to fix some bugs, univocity-parsers was upgraded from 2.8.3 to 2.9.0; the upgrade also introduced a new univocity-parsers feature that quotes values of the first column that start with the comment character. This was a breaking change for downstream users that handle a whole row as input. Before this change: #abc,1 After this change: "#abc",1 We change the related `isCommentSet` check logic to enable users to keep the previous behavior. ### Does this PR introduce _any_ user-facing change? Yes, a little. If users set the comment option to '\u0000' explicitly, they should now remove it to keep the comment option unset. ### How was this patch tested? Add a new test. Closes #39878 from wayneguow/comment. 
Authored-by: wayneguow <guo...@gmail.com> Signed-off-by: Sean Owen <sro...@gmail.com> --- .../apache/spark/sql/catalyst/csv/CSVOptions.scala | 5 ++- .../sql/execution/datasources/csv/CSVSuite.scala | 47 ++++++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index a66070aa853..81fcffec586 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -222,7 +222,10 @@ class CSVOptions( */ val maxErrorContentLength = 1000 - val isCommentSet = this.comment != '\u0000' + val isCommentSet = parameters.get(COMMENT) match { + case Some(value) if value.length == 1 => true + case _ => false + } val samplingRatio = parameters.get(SAMPLING_RATIO).map(_.toDouble).getOrElse(1.0) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 3fe91b12e15..44f1b2faceb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -3101,6 +3101,53 @@ abstract class CSVSuite } } + test("SPARK-42335: Pass the comment option through to univocity " + + "if users set it explicitly in CSV dataSource") { + withTempPath { path => + Seq("#abc", "\u0000def", "xyz").toDF() + .write.option("comment", "\u0000").csv(path.getCanonicalPath) + checkAnswer( + spark.read.text(path.getCanonicalPath), + Seq(Row("#abc"), Row("\"def\""), Row("xyz")) + ) + } + withTempPath { path => + Seq("#abc", "\u0000def", "xyz").toDF() + .write.option("comment", "#").csv(path.getCanonicalPath) + checkAnswer( + spark.read.text(path.getCanonicalPath), + 
Seq(Row("\"#abc\""), Row("def"), Row("xyz")) + ) + } + withTempPath { path => + Seq("#abc", "\u0000def", "xyz").toDF() + .write.csv(path.getCanonicalPath) + checkAnswer( + spark.read.text(path.getCanonicalPath), + Seq(Row("\"#abc\""), Row("def"), Row("xyz")) + ) + } + withTempPath { path => + Seq("#abc", "\u0000def", "xyz").toDF().write.text(path.getCanonicalPath) + checkAnswer( + spark.read.option("comment", "\u0000").csv(path.getCanonicalPath), + Seq(Row("#abc"), Row("xyz"))) + } + withTempPath { path => + Seq("#abc", "\u0000def", "xyz").toDF().write.text(path.getCanonicalPath) + checkAnswer( + spark.read.option("comment", "#").csv(path.getCanonicalPath), + Seq(Row("\u0000def"), Row("xyz"))) + } + withTempPath { path => + Seq("#abc", "\u0000def", "xyz").toDF().write.text(path.getCanonicalPath) + checkAnswer( + spark.read.csv(path.getCanonicalPath), + Seq(Row("#abc"), Row("\u0000def"), Row("xyz")) + ) + } + } + test("SPARK-40667: validate CSV Options") { assert(CSVOptions.getAllOptions.size == 38) // Please add validation on any new CSV options here --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org