This is an automated email from the ASF dual-hosted git repository. maxgekk pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.1 by this push: new bd03050 [SPARK-35045][SQL] Add an internal option to control input buffer in univocity bd03050 is described below commit bd03050ff7493e9f0331de996bdafcde993c5fc3 Author: Hyukjin Kwon <gurwls...@apache.org> AuthorDate: Tue Apr 13 15:08:01 2021 +0300 [SPARK-35045][SQL] Add an internal option to control input buffer in univocity ### What changes were proposed in this pull request? This PR makes the input buffer configurable (as an internal option). This is mainly to work around uniVocity/univocity-parsers#449. ### Why are the changes needed? To work around uniVocity/univocity-parsers#449. ### Does this PR introduce _any_ user-facing change? No, it's only an internal option. ### How was this patch tested? Manually tested by modifying the unit test added in https://github.com/apache/spark/pull/31858 as below: ```diff diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index fd25a79619d..b58f0bd3661 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -2460,6 +2460,7 @@ abstract class CSVSuite Seq(line).toDF.write.text(path.getAbsolutePath) assert(spark.read.format("csv") .option("delimiter", "|") + .option("inputBufferSize", "128") .option("ignoreTrailingWhiteSpace", "true").load(path.getAbsolutePath).count() == 1) } } ``` Closes #32145 from HyukjinKwon/SPARK-35045. 
Lead-authored-by: Hyukjin Kwon <gurwls...@apache.org> Co-authored-by: HyukjinKwon <gurwls...@apache.org> Signed-off-by: Max Gekk <max.g...@gmail.com> (cherry picked from commit 1f562159bf61dd5e536db7841b16e74a635e7a97) Signed-off-by: Max Gekk <max.g...@gmail.com> --- .../src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index c6a8061..2e5539a 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -211,6 +211,8 @@ class CSVOptions( } val lineSeparatorInWrite: Option[String] = lineSeparator + val inputBufferSize: Option[Int] = parameters.get("inputBufferSize").map(_.toInt) + /** * The handling method to be used when unescaped quotes are found in the input. */ @@ -257,6 +259,7 @@ class CSVOptions( settings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceInRead) settings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceInRead) settings.setReadInputOnSeparateThread(false) + inputBufferSize.foreach(settings.setInputBufferSize) settings.setMaxColumns(maxColumns) settings.setNullValue(nullValue) settings.setEmptyValue(emptyValueInRead) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org