This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.1 by this push: new 0922380 [SPARK-34768][SQL] Respect the default input buffer size in Univocity 0922380 is described below commit 0922380406f667ad11b0795ca63be2a8a21a7266 Author: HyukjinKwon <gurwls...@apache.org> AuthorDate: Wed Mar 17 19:55:49 2021 +0900 [SPARK-34768][SQL] Respect the default input buffer size in Univocity ### What changes were proposed in this pull request? This PR proposes to follow Univocity's default input buffer size. ### Why are the changes needed? - Firstly, it's best to trust their judgement on the default values. Also 128 is too low. - Default values arguably have more test coverage in Univocity. - It will also fix https://github.com/uniVocity/univocity-parsers/issues/449 - ^ is a regression compared to Spark 2.4 ### Does this PR introduce _any_ user-facing change? No. In addition, it fixes a regression. ### How was this patch tested? Manually tested, and added a unit test. Closes #31858 from HyukjinKwon/SPARK-34768. Authored-by: HyukjinKwon <gurwls...@apache.org> Signed-off-by: HyukjinKwon <gurwls...@apache.org> (cherry picked from commit 385f1e8f5de5dcad62554cd75446e98c9380b384) Signed-off-by: HyukjinKwon <gurwls...@apache.org> --- .../scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala | 3 --- .../apache/spark/sql/execution/datasources/csv/CSVSuite.scala | 11 +++++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index ec40599..c6a8061 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -166,8 +166,6 @@ class CSVOptions( val quoteAll = getBool("quoteAll", false) - val inputBufferSize = 128 - /** * The max error content length in CSV parser/writer exception message. 
*/ @@ -259,7 +257,6 @@ class CSVOptions( settings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceInRead) settings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceInRead) settings.setReadInputOnSeparateThread(false) - settings.setInputBufferSize(inputBufferSize) settings.setMaxColumns(maxColumns) settings.setNullValue(nullValue) settings.setEmptyValue(emptyValueInRead) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 30f0e45..3fe6ce7 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -2452,6 +2452,17 @@ abstract class CSVSuite assert(result.sameElements(exceptResults)) } } + + test("SPARK-34768: counting a long record with ignoreTrailingWhiteSpace set to true") { + val bufSize = 128 + val line = "X" * (bufSize - 1) + "| |" + withTempPath { path => + Seq(line).toDF.write.text(path.getAbsolutePath) + assert(spark.read.format("csv") + .option("delimiter", "|") + .option("ignoreTrailingWhiteSpace", "true").load(path.getAbsolutePath).count() == 1) + } + } } class CSVv1Suite extends CSVSuite { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org