Repository: spark Updated Branches: refs/heads/master 628bdeabd -> 51620e288
[SPARK-21756][SQL] Add JSON option to allow unquoted control characters ## What changes were proposed in this pull request? This patch adds an allowUnquotedControlChars option to the JSON data source that allows JSON strings to contain unquoted control characters (ASCII characters with value less than 32, including tab and line feed characters). ## How was this patch tested? Added new test cases. Author: vinodkc <vinod.kc...@gmail.com> Closes #19008 from vinodkc/br_fix_SPARK-21756. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/51620e28 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/51620e28 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/51620e28 Branch: refs/heads/master Commit: 51620e288b5e0a7fffc3899c9deadabace28e6d7 Parents: 628bdea Author: vinodkc <vinod.kc...@gmail.com> Authored: Fri Aug 25 10:18:03 2017 -0700 Committer: gatorsmile <gatorsm...@gmail.com> Committed: Fri Aug 25 10:18:03 2017 -0700 ---------------------------------------------------------------------- python/pyspark/sql/readwriter.py | 8 ++++++-- python/pyspark/sql/streaming.py | 8 ++++++-- .../apache/spark/sql/catalyst/json/JSONOptions.scala | 3 +++ .../scala/org/apache/spark/sql/DataFrameReader.scala | 3 +++ .../spark/sql/streaming/DataStreamReader.scala | 3 +++ .../datasources/json/JsonParsingOptionsSuite.scala | 15 +++++++++++++++ 6 files changed, 36 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/51620e28/python/pyspark/sql/readwriter.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 7279173..01da0dc 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -176,7 +176,7 @@ class DataFrameReader(OptionUtils): allowComments=None, allowUnquotedFieldNames=None, 
allowSingleQuotes=None, allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None, mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None, - multiLine=None): + multiLine=None, allowUnquotedControlChars=None): """ Loads JSON files and returns the results as a :class:`DataFrame`. @@ -234,6 +234,9 @@ class DataFrameReader(OptionUtils): default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``. :param multiLine: parse one record, which may span multiple lines, per file. If None is set, it uses the default value, ``false``. + :param allowUnquotedControlChars: allows JSON Strings to contain unquoted control + characters (ASCII characters with value less than 32, + including tab and line feed characters) or not. >>> df1 = spark.read.json('python/test_support/sql/people.json') >>> df1.dtypes @@ -250,7 +253,8 @@ class DataFrameReader(OptionUtils): allowSingleQuotes=allowSingleQuotes, allowNumericLeadingZero=allowNumericLeadingZero, allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter, mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat, - timestampFormat=timestampFormat, multiLine=multiLine) + timestampFormat=timestampFormat, multiLine=multiLine, + allowUnquotedControlChars=allowUnquotedControlChars) if isinstance(path, basestring): path = [path] if type(path) == list: http://git-wip-us.apache.org/repos/asf/spark/blob/51620e28/python/pyspark/sql/streaming.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 5bbd70c..0cf7021 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -407,7 +407,7 @@ class DataStreamReader(OptionUtils): allowComments=None, allowUnquotedFieldNames=None, allowSingleQuotes=None, allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None, mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None, - 
multiLine=None): + multiLine=None, allowUnquotedControlChars=None): """ Loads a JSON file stream and returns the results as a :class:`DataFrame`. @@ -467,6 +467,9 @@ class DataStreamReader(OptionUtils): default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``. :param multiLine: parse one record, which may span multiple lines, per file. If None is set, it uses the default value, ``false``. + :param allowUnquotedControlChars: allows JSON Strings to contain unquoted control + characters (ASCII characters with value less than 32, + including tab and line feed characters) or not. >>> json_sdf = spark.readStream.json(tempfile.mkdtemp(), schema = sdf_schema) >>> json_sdf.isStreaming @@ -480,7 +483,8 @@ class DataStreamReader(OptionUtils): allowSingleQuotes=allowSingleQuotes, allowNumericLeadingZero=allowNumericLeadingZero, allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter, mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat, - timestampFormat=timestampFormat, multiLine=multiLine) + timestampFormat=timestampFormat, multiLine=multiLine, + allowUnquotedControlChars=allowUnquotedControlChars) if isinstance(path, basestring): return self._df(self._jreader.json(path)) else: http://git-wip-us.apache.org/repos/asf/spark/blob/51620e28/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala index 1fd680a..652412b 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala @@ -64,6 +64,8 @@ private[sql] class JSONOptions( parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(true) val allowBackslashEscapingAnyCharacter = 
parameters.get("allowBackslashEscapingAnyCharacter").map(_.toBoolean).getOrElse(false) + private val allowUnquotedControlChars = + parameters.get("allowUnquotedControlChars").map(_.toBoolean).getOrElse(false) val compressionCodec = parameters.get("compression").map(CompressionCodecs.getCodecClassName) val parseMode: ParseMode = parameters.get("mode").map(ParseMode.fromString).getOrElse(PermissiveMode) @@ -92,5 +94,6 @@ private[sql] class JSONOptions( factory.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, allowNonNumericNumbers) factory.configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER, allowBackslashEscapingAnyCharacter) + factory.configure(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS, allowUnquotedControlChars) } } http://git-wip-us.apache.org/repos/asf/spark/blob/51620e28/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 41cb019..8209cec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -313,6 +313,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * (e.g. 00012)</li> * <li>`allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all * character using backslash quoting mechanism</li> + * <li>`allowUnquotedControlChars` (default `false`): allows JSON Strings to contain unquoted + * control characters (ASCII characters with value less than 32, including tab and line feed + * characters) or not.</li> * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records * during parsing. 
* <ul> http://git-wip-us.apache.org/repos/asf/spark/blob/51620e28/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala index 70ddfa8..a42e280 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala @@ -195,6 +195,9 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo * (e.g. 00012)</li> * <li>`allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all * character using backslash quoting mechanism</li> + * <li>`allowUnquotedControlChars` (default `false`): allows JSON Strings to contain unquoted + * control characters (ASCII characters with value less than 32, including tab and line feed + * characters) or not.</li> * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records * during parsing. 
* <ul> http://git-wip-us.apache.org/repos/asf/spark/blob/51620e28/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala index 6e2b4f0..316c518 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala @@ -72,6 +72,21 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext { assert(df.first().getString(0) == "Reynold Xin") } + test("allowUnquotedControlChars off") { + val str = """{"name": "a\u0001b"}""" + val df = spark.read.json(Seq(str).toDS()) + + assert(df.schema.head.name == "_corrupt_record") + } + + test("allowUnquotedControlChars on") { + val str = """{"name": "a\u0001b"}""" + val df = spark.read.option("allowUnquotedControlChars", "true").json(Seq(str).toDS()) + + assert(df.schema.head.name == "name") + assert(df.first().getString(0) == "a\u0001b") + } + test("allowNumericLeadingZeros off") { val str = """{"age": 0018}""" val df = spark.read.json(Seq(str).toDS()) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org