Repository: spark Updated Branches: refs/heads/master d0ecff285 -> 1e6c1d8bf
[SPARK-25493][SQL] Use auto-detection for CRLF in CSV datasource multiline mode ## What changes were proposed in this pull request? CSVs with Windows-style CRLF ('\r\n') don't work in multiline mode. They work fine in single-line mode because the line separation is done by Hadoop, which can handle all the different types of line separators. This PR fixes it by enabling Univocity's line separator detection in multiline mode, which will detect '\r\n', '\r', or '\n' automatically, as is done by Hadoop in single-line mode. ## How was this patch tested? Unit test with a file with CRLF line endings. Closes #22503 from justinuang/fix-clrf-multiline. Authored-by: Justin Uang <ju...@palantir.com> Signed-off-by: hyukjinkwon <gurwls...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1e6c1d8b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1e6c1d8b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1e6c1d8b Branch: refs/heads/master Commit: 1e6c1d8bfb7841596452e25b870823b9a4b267f4 Parents: d0ecff2 Author: Justin Uang <ju...@palantir.com> Authored: Fri Oct 19 11:13:02 2018 +0800 Committer: hyukjinkwon <gurwls...@apache.org> Committed: Fri Oct 19 11:13:02 2018 +0800 ---------------------------------------------------------------------- .../org/apache/spark/sql/catalyst/csv/CSVOptions.scala | 2 ++ sql/core/src/test/resources/test-data/cars-crlf.csv | 7 +++++++ .../spark/sql/execution/datasources/csv/CSVSuite.scala | 12 ++++++++++++ 3 files changed, 21 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/1e6c1d8b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index 3e25d82..cdaaa17 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -212,6 +212,8 @@ class CSVOptions( settings.setEmptyValue(emptyValueInRead) settings.setMaxCharsPerColumn(maxCharsPerColumn) settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER) + settings.setLineSeparatorDetectionEnabled(multiLine == true) + settings } } http://git-wip-us.apache.org/repos/asf/spark/blob/1e6c1d8b/sql/core/src/test/resources/test-data/cars-crlf.csv ---------------------------------------------------------------------- diff --git a/sql/core/src/test/resources/test-data/cars-crlf.csv b/sql/core/src/test/resources/test-data/cars-crlf.csv new file mode 100644 index 0000000..d018d08 --- /dev/null +++ b/sql/core/src/test/resources/test-data/cars-crlf.csv @@ -0,0 +1,7 @@ + +year,make,model,comment,blank +"2012","Tesla","S","No comment", + +1997,Ford,E350,"Go get one now they are going fast", +2015,Chevy,Volt + http://git-wip-us.apache.org/repos/asf/spark/blob/1e6c1d8b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index d59035b..d43efc8 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -52,6 +52,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te private val carsNullFile = "test-data/cars-null.csv" private val carsEmptyValueFile = "test-data/cars-empty-value.csv" private val 
carsBlankColName = "test-data/cars-blank-column-name.csv" + private val carsCrlf = "test-data/cars-crlf.csv" private val emptyFile = "test-data/empty.csv" private val commentsFile = "test-data/comments.csv" private val disableCommentsFile = "test-data/disable_comments.csv" @@ -220,6 +221,17 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te } } + test("crlf line separators in multiline mode") { + val cars = spark + .read + .format("csv") + .option("multiLine", "true") + .option("header", "true") + .load(testFile(carsCrlf)) + + verifyCars(cars, withHeader = true) + } + test("test aliases sep and encoding for delimiter and charset") { // scalastyle:off val cars = spark --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org