spark git commit: [SPARK-16515][SQL] set default record reader and writer for script transformation

2016-07-18 Thread yhuai
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 085f3cc85 -> 33d92f7f3


[SPARK-16515][SQL] set default record reader and writer for script 
transformation

## What changes were proposed in this pull request?
In ScriptInputOutputSchema, we read the default RecordReader and RecordWriter from 
the conf. Since Spark 2.0 has removed those config keys from the Hive conf, we have 
to set the default reader/writer class names ourselves. Otherwise the record handler 
for LazySimpleSerDe resolves to None, and the data written cannot be read back by 
the script. The test case added here worked fine with previous versions of Spark 
but would fail now.
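
For context, the behavioral change boils down to how a missing key is handled. The sketch below is illustrative only: it uses a small stand-in object instead of Spark's SQLConf (whose one-argument getConfString throws when the key is unset), with the same record-reader key and default used in this patch.

```scala
import scala.util.Try

// Illustrative stand-in for SQLConf: the one-argument lookup throws when the
// key is unset (as happens once Spark 2.0 no longer seeds the Hive script
// keys), while the two-argument form falls back to a supplied default.
object FakeConf {
  private val settings = Map.empty[String, String]
  def getConfString(key: String): String =
    settings.getOrElse(key, throw new NoSuchElementException(key))
  def getConfString(key: String, default: String): String =
    settings.getOrElse(key, default)
}

// Old code path: the failed lookup is swallowed and the handler becomes None.
val before = Try(FakeConf.getConfString("hive.script.recordreader")).toOption
assert(before.isEmpty)

// New code path: the hard-coded default keeps the record reader resolvable.
val after = Option(FakeConf.getConfString(
  "hive.script.recordreader", "org.apache.hadoop.hive.ql.exec.TextRecordReader"))
assert(after.contains("org.apache.hadoop.hive.ql.exec.TextRecordReader"))
```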

## How was this patch tested?
Added a test case in SQLQuerySuite.
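
A rough sketch of the shape such a test takes; the table name, sample rows, and the `spark` session handle are illustrative rather than the exact suite code:

```scala
// Sketch of an end-to-end check: pipe rows through an external script with
// the default (serde) row format and read its output back. Assumes a
// SparkSession named `spark` with Hive support; identifiers are illustrative.
import spark.implicits._

Seq(("a1", "b1"), ("a2", "b2")).toDF("c1", "c2")
  .createOrReplaceTempView("script_table")

val transformed = spark.sql(
  """SELECT TRANSFORM(c1, c2)
    |USING 'bash src/test/resources/test_script.sh'
    |AS (col1 STRING, col2 STRING)
    |FROM script_table""".stripMargin)

// Before this fix the LazySimpleSerDe record reader resolved to None, so
// reading the script's output failed; with the defaults in place it succeeds.
transformed.show()
```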

Closes #14169

Author: Daoyuan Wang 
Author: Yin Huai 

Closes #14249 from yhuai/scriptTransformation.

(cherry picked from commit 96e9afaae93318250334211cc80ed0fee3d055b9)
Signed-off-by: Yin Huai 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/33d92f7f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/33d92f7f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/33d92f7f

Branch: refs/heads/branch-2.0
Commit: 33d92f7f39136bd399e1f7cabd264e7eeca9b958
Parents: 085f3cc
Author: Daoyuan Wang 
Authored: Mon Jul 18 13:58:12 2016 -0700
Committer: Yin Huai 
Committed: Mon Jul 18 13:58:56 2016 -0700

--
 .../spark/sql/execution/SparkSqlParser.scala| 16 +-
 sql/hive/src/test/resources/test_script.sh  | 23 
 .../sql/hive/execution/SQLQuerySuite.scala  | 11 ++
 3 files changed, 45 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/33d92f7f/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
--
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
index 42ec210..3573a86 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SparkSqlParser.scala
@@ -1315,7 +1315,10 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder {
 
     // Decode and input/output format.
     type Format = (Seq[(String, String)], Option[String], Seq[(String, String)], Option[String])
-    def format(fmt: RowFormatContext, configKey: String): Format = fmt match {
+    def format(
+        fmt: RowFormatContext,
+        configKey: String,
+        defaultConfigValue: String): Format = fmt match {
       case c: RowFormatDelimitedContext =>
         // TODO we should use the visitRowFormatDelimited function here. However HiveScriptIOSchema
         // expects a seq of pairs in which the old parsers' token names are used as keys.
@@ -1338,7 +1341,7 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder {
 
         // SPARK-10310: Special cases LazySimpleSerDe
         val recordHandler = if (name == "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe") {
-          Try(conf.getConfString(configKey)).toOption
+          Option(conf.getConfString(configKey, defaultConfigValue))
         } else {
           None
         }
@@ -1349,15 +1352,18 @@ class SparkSqlAstBuilder(conf: SQLConf) extends AstBuilder {
         val name = conf.getConfString("hive.script.serde",
           "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")
         val props = Seq("field.delim" -> "\t")
-        val recordHandler = Try(conf.getConfString(configKey)).toOption
+        val recordHandler = Option(conf.getConfString(configKey, defaultConfigValue))
         (Nil, Option(name), props, recordHandler)
     }
 
     val (inFormat, inSerdeClass, inSerdeProps, reader) =
-      format(inRowFormat, "hive.script.recordreader")
+      format(
+        inRowFormat, "hive.script.recordreader", "org.apache.hadoop.hive.ql.exec.TextRecordReader")
 
     val (outFormat, outSerdeClass, outSerdeProps, writer) =
-      format(outRowFormat, "hive.script.recordwriter")
+      format(
+        outRowFormat, "hive.script.recordwriter",
+        "org.apache.hadoop.hive.ql.exec.TextRecordWriter")
 
     ScriptInputOutputSchema(
       inFormat, outFormat,
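
Note that the new defaults only apply when the keys are absent: the parser still consults `hive.script.recordreader` and `hive.script.recordwriter` first, so both remain overridable. A sketch, assuming a SparkSession named `spark`:

```scala
// The parser falls back to TextRecordReader/TextRecordWriter only when these
// keys are unset, so a custom handler class can still be configured.
spark.conf.set("hive.script.recordreader",
  "org.apache.hadoop.hive.ql.exec.TextRecordReader")
spark.conf.set("hive.script.recordwriter",
  "org.apache.hadoop.hive.ql.exec.TextRecordWriter")
```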

http://git-wip-us.apache.org/repos/asf/spark/blob/33d92f7f/sql/hive/src/test/resources/test_script.sh
--
diff --git a/sql/hive/src/test/resources/test_script.sh b/sql/hive/src/test/resources/test_script.sh
new file mode 100755
index 0000000..ab998c4
--- /dev/null
+++ b/sql/hive/src/test/resources/test_script.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file