Repository: spark
Updated Branches:
  refs/heads/branch-2.4 bb211cf27 -> 1a335444e
[SPARK-25660][SQL] Fix for the backward slash as CSV fields delimiter

## What changes were proposed in this pull request?

The PR addresses the exception raised when accessing characters beyond the end of the delimiter string. In particular, using the backslash `\` as the CSV field delimiter causes the following exception when reading `abc\1`:
```Scala
String index out of range: 1
java.lang.StringIndexOutOfBoundsException: String index out of range: 1
	at java.lang.String.charAt(String.java:658)
```
because `str.charAt(1)` tries to access a character beyond the end of `str` in `CSVUtils.toChar`.

## How was this patch tested?

Added tests for the empty string and for a string containing the backslash to `CSVUtilsSuite`. Besides that, I added an end-to-end test to check how the backslash is handled when reading a CSV string that contains it.

Closes #22654 from MaxGekk/csv-slash-delim.

Authored-by: Maxim Gekk <maxim.g...@databricks.com>
Signed-off-by: gatorsmile <gatorsm...@gmail.com>
(cherry picked from commit c7eadb5e6652468f9d5cd714c112ba1de187eea8)
Signed-off-by: gatorsmile <gatorsm...@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1a335444
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1a335444
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1a335444

Branch: refs/heads/branch-2.4
Commit: 1a335444e6ba4124bd0f7f351f097c0bdb46ae85
Parents: bb211cf
Author: Maxim Gekk <maxim.g...@databricks.com>
Authored: Fri Oct 12 12:04:00 2018 -0700
Committer: gatorsmile <gatorsm...@gmail.com>
Committed: Fri Oct 12 12:04:16 2018 -0700

----------------------------------------------------------------------
 .../execution/datasources/csv/CSVUtils.scala    | 36 +++++++++++---------
 .../execution/datasources/csv/CSVSuite.scala    | 10 ++++++
 .../datasources/csv/CSVUtilsSuite.scala         | 14 ++++++++
 3 files changed, 43 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/1a335444/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
index 7ce65fa..b367b3d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
@@ -97,23 +97,25 @@ object CSVUtils {
    */
   @throws[IllegalArgumentException]
   def toChar(str: String): Char = {
-    if (str.charAt(0) == '\\') {
-      str.charAt(1)
-      match {
-        case 't' => '\t'
-        case 'r' => '\r'
-        case 'b' => '\b'
-        case 'f' => '\f'
-        case '\"' => '\"' // In case user changes quote char and uses \" as delimiter in options
-        case '\'' => '\''
-        case 'u' if str == """\u0000""" => '\u0000'
-        case _ =>
-          throw new IllegalArgumentException(s"Unsupported special character for delimiter: $str")
-      }
-    } else if (str.length == 1) {
-      str.charAt(0)
-    } else {
-      throw new IllegalArgumentException(s"Delimiter cannot be more than one character: $str")
+    (str: Seq[Char]) match {
+      case Seq() => throw new IllegalArgumentException("Delimiter cannot be empty string")
+      case Seq('\\') => throw new IllegalArgumentException("Single backslash is prohibited." +
+        " It has special meaning as beginning of an escape sequence." +
+        " To get the backslash character, pass a string with two backslashes as the delimiter.")
+      case Seq(c) => c
+      case Seq('\\', 't') => '\t'
+      case Seq('\\', 'r') => '\r'
+      case Seq('\\', 'b') => '\b'
+      case Seq('\\', 'f') => '\f'
+      // In case user changes quote char and uses \" as delimiter in options
+      case Seq('\\', '\"') => '\"'
+      case Seq('\\', '\'') => '\''
+      case Seq('\\', '\\') => '\\'
+      case _ if str == """\u0000""" => '\u0000'
+      case Seq('\\', _) =>
+        throw new IllegalArgumentException(s"Unsupported special character for delimiter: $str")
+      case _ =>
+        throw new IllegalArgumentException(s"Delimiter cannot be more than one character: $str")
     }
   }


http://git-wip-us.apache.org/repos/asf/spark/blob/1a335444/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 5d4746c..d59035b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1826,4 +1826,14 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
     val df = spark.read.option("enforceSchema", false).csv(input)
     checkAnswer(df, Row("1", "2"))
   }
+
+  test("using the backward slash as the delimiter") {
+    val input = Seq("""abc\1""").toDS()
+    val delimiter = """\\"""
+    checkAnswer(spark.read.option("delimiter", delimiter).csv(input), Row("abc", "1"))
+    checkAnswer(spark.read.option("inferSchema", true).option("delimiter", delimiter).csv(input),
+      Row("abc", 1))
+    val schema = new StructType().add("a", StringType).add("b", IntegerType)
+    checkAnswer(spark.read.schema(schema).option("delimiter", delimiter).csv(input), Row("abc", 1))
+  }
 }


http://git-wip-us.apache.org/repos/asf/spark/blob/1a335444/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
index 221e44c..60fcbd2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtilsSuite.scala
@@ -28,6 +28,7 @@ class CSVUtilsSuite extends SparkFunSuite {
     assert(CSVUtils.toChar("""\"""") === '\"')
     assert(CSVUtils.toChar("""\'""") === '\'')
     assert(CSVUtils.toChar("""\u0000""") === '\u0000')
+    assert(CSVUtils.toChar("""\\""") === '\\')
   }
 
   test("Does not accept delimiter larger than one character") {
@@ -44,4 +45,17 @@
     assert(exception.getMessage.contains("Unsupported special character for delimiter"))
   }
+
+  test("string with one backward slash is prohibited") {
+    val exception = intercept[IllegalArgumentException]{
+      CSVUtils.toChar("""\""")
+    }
+    assert(exception.getMessage.contains("Single backslash is prohibited"))
+  }
+
+  test("output proper error message for empty string") {
+    val exception = intercept[IllegalArgumentException]{
+      CSVUtils.toChar("")
+    }
+    assert(exception.getMessage.contains("Delimiter cannot be empty string"))
+  }
 }