spark git commit: [SPARK-16162] Remove dead code OrcTableScan.
Repository: spark Updated Branches: refs/heads/master f34b5c62b -> 4374a46bf [SPARK-16162] Remove dead code OrcTableScan. ## What changes were proposed in this pull request? SPARK-14535 removed all calls to class OrcTableScan. This removes the dead code. ## How was this patch tested? Existing unit tests. Author: Brian Cho Closes #13869 from dafrista/clean-up-orctablescan. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4374a46b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4374a46b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4374a46b Branch: refs/heads/master Commit: 4374a46bfc52ee4f3ae9f61ccedc77a62aa9d4ee Parents: f34b5c6 Author: Brian Cho Authored: Wed Jun 22 22:37:50 2016 -0700 Committer: Reynold Xin Committed: Wed Jun 22 22:37:50 2016 -0700 -- .../spark/sql/hive/orc/OrcFileFormat.scala | 67 +--- 1 file changed, 1 insertion(+), 66 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4374a46b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala index a2c8092..5de3507 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala @@ -27,12 +27,10 @@ import org.apache.hadoop.hive.ql.io.orc._ import org.apache.hadoop.hive.serde2.objectinspector.{SettableStructObjectInspector, StructObjectInspector} import org.apache.hadoop.hive.serde2.typeinfo.{StructTypeInfo, TypeInfoUtils} import org.apache.hadoop.io.{NullWritable, Writable} -import org.apache.hadoop.mapred.{InputFormat => MapRedInputFormat, JobConf, OutputFormat => MapRedOutputFormat, RecordWriter, Reporter} +import org.apache.hadoop.mapred.{JobConf, OutputFormat => MapRedOutputFormat, RecordWriter, Reporter} import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit} -import org.apache.spark.internal.Logging -import org.apache.spark.rdd.{HadoopRDD, RDD} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -260,69 +258,6 @@ private[orc] class OrcOutputWriter( } } -private[orc] case class OrcTableScan( -@transient sparkSession: SparkSession, -attributes: Seq[Attribute], -filters: Array[Filter], -@transient inputPaths: Seq[FileStatus]) - extends Logging - with HiveInspectors { - - def execute(): RDD[InternalRow] = { -val job = Job.getInstance(sparkSession.sessionState.newHadoopConf()) -val conf = job.getConfiguration - -// Figure out the actual schema from the ORC source (without partition columns) so that we -// can pick the correct ordinals. Note that this assumes that all files have the same schema. 
-val orcFormat = new OrcFileFormat -val dataSchema = - orcFormat -.inferSchema(sparkSession, Map.empty, inputPaths) -.getOrElse(sys.error("Failed to read schema from target ORC files.")) - -// Tries to push down filters if ORC filter push-down is enabled -if (sparkSession.sessionState.conf.orcFilterPushDown) { - OrcFilters.createFilter(dataSchema, filters).foreach { f => -conf.set(OrcTableScan.SARG_PUSHDOWN, f.toKryo) -conf.setBoolean(ConfVars.HIVEOPTINDEXFILTER.varname, true) - } -} - -// Sets requested columns -OrcRelation.setRequiredColumns(conf, dataSchema, StructType.fromAttributes(attributes)) - -if (inputPaths.isEmpty) { - // the input path probably be pruned, return an empty RDD. - return sparkSession.sparkContext.emptyRDD[InternalRow] -} -FileInputFormat.setInputPaths(job, inputPaths.map(_.getPath): _*) - -val inputFormatClass = - classOf[OrcInputFormat] -.asInstanceOf[Class[_ <: MapRedInputFormat[NullWritable, Writable]]] - -val rdd = sparkSession.sparkContext.hadoopRDD( - conf.asInstanceOf[JobConf], - inputFormatClass, - classOf[NullWritable], - classOf[Writable] -).asInstanceOf[HadoopRDD[NullWritable, Writable]] - -val wrappedConf = new SerializableConfiguration(conf) - -rdd.mapPartitionsWithInputSplit { case (split: OrcSplit, iterator) => - val writableIterator = iterator.map(_._2) - val maybeStructOI = OrcFileOperator.getObjectInspector(split.getPath.toString, Some(conf)) - OrcRelation.unwrapOrcStructs( -wrappedConf.value, -StructType.fromAttributes(attributes), -maybeStructOI, -writableIterator - ) -} -
spark git commit: [SQL][MINOR] Fix minor formatting issues in SHOW CREATE TABLE output
Repository: spark Updated Branches: refs/heads/branch-2.0 5b4a9a4c3 -> 4ad731ed6 [SQL][MINOR] Fix minor formatting issues in SHOW CREATE TABLE output ## What changes were proposed in this pull request? This PR fixes two minor formatting issues appearing in `SHOW CREATE TABLE` output. Before: ``` CREATE EXTERNAL TABLE ... ... WITH SERDEPROPERTIES ('serialization.format' = '1' ) ... TBLPROPERTIES ('avro.schema.url' = '/tmp/avro/test.avsc', 'transient_lastDdlTime' = '1466638180') ``` After: ``` CREATE EXTERNAL TABLE ... ... WITH SERDEPROPERTIES ( 'serialization.format' = '1' ) ... TBLPROPERTIES ( 'avro.schema.url' = '/tmp/avro/test.avsc', 'transient_lastDdlTime' = '1466638180' ) ``` ## How was this patch tested? Manually tested. Author: Cheng Lian Closes #13864 from liancheng/show-create-table-format-fix. (cherry picked from commit f34b5c62b2da3fe0ea989acea46fff949d349afc) Signed-off-by: Reynold Xin Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4ad731ed Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4ad731ed Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4ad731ed Branch: refs/heads/branch-2.0 Commit: 4ad731ed6a963131f05c387c2f9536b56d228090 Parents: 5b4a9a4 Author: Cheng Lian Authored: Wed Jun 22 22:28:54 2016 -0700 Committer: Reynold Xin Committed: Wed Jun 22 22:29:00 2016 -0700 -- .../scala/org/apache/spark/sql/execution/command/tables.scala| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4ad731ed/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 3eb93a2..30dc7e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -830,7 +830,7 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'" } - builder ++= serdeProps.mkString("WITH SERDEPROPERTIES (", ",\n ", "\n)\n") + builder ++= serdeProps.mkString("WITH SERDEPROPERTIES (\n ", ",\n ", "\n)\n") } if (storage.inputFormat.isDefined || storage.outputFormat.isDefined) { @@ -864,7 +864,7 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman } if (props.nonEmpty) { -builder ++= props.mkString("TBLPROPERTIES (", ",\n ", ")\n") +builder ++= props.mkString("TBLPROPERTIES (\n ", ",\n ", "\n)\n") } } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SQL][MINOR] Fix minor formatting issues in SHOW CREATE TABLE output
Repository: spark Updated Branches: refs/heads/master 925884a61 -> f34b5c62b [SQL][MINOR] Fix minor formatting issues in SHOW CREATE TABLE output ## What changes were proposed in this pull request? This PR fixes two minor formatting issues appearing in `SHOW CREATE TABLE` output. Before: ``` CREATE EXTERNAL TABLE ... ... WITH SERDEPROPERTIES ('serialization.format' = '1' ) ... TBLPROPERTIES ('avro.schema.url' = '/tmp/avro/test.avsc', 'transient_lastDdlTime' = '1466638180') ``` After: ``` CREATE EXTERNAL TABLE ... ... WITH SERDEPROPERTIES ( 'serialization.format' = '1' ) ... TBLPROPERTIES ( 'avro.schema.url' = '/tmp/avro/test.avsc', 'transient_lastDdlTime' = '1466638180' ) ``` ## How was this patch tested? Manually tested. Author: Cheng Lian Closes #13864 from liancheng/show-create-table-format-fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f34b5c62 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f34b5c62 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f34b5c62 Branch: refs/heads/master Commit: f34b5c62b2da3fe0ea989acea46fff949d349afc Parents: 925884a Author: Cheng Lian Authored: Wed Jun 22 22:28:54 2016 -0700 Committer: Reynold Xin Committed: Wed Jun 22 22:28:54 2016 -0700 -- .../scala/org/apache/spark/sql/execution/command/tables.scala| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f34b5c62/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala index 3eb93a2..30dc7e8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/tables.scala @@ -830,7 +830,7 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'" } - builder ++= serdeProps.mkString("WITH SERDEPROPERTIES (", ",\n ", "\n)\n") + builder ++= serdeProps.mkString("WITH SERDEPROPERTIES (\n ", ",\n ", "\n)\n") } if (storage.inputFormat.isDefined || storage.outputFormat.isDefined) { @@ -864,7 +864,7 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableComman } if (props.nonEmpty) { -builder ++= props.mkString("TBLPROPERTIES (", ",\n ", ")\n") +builder ++= props.mkString("TBLPROPERTIES (\n ", ",\n ", "\n)\n") } } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
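Editor's note: the whole fix is in the `mkString(start, sep, end)` arguments. The standalone sketch below (plain Scala, not Spark code; the property strings mirror the PR description) shows how moving the newline into the prefix and suffix yields the corrected layout.

```scala
// Standalone sketch (not Spark code) of how the corrected mkString arguments
// produce the layout shown in the "After" block above.
object ShowCreateTableFormatSketch {
  def main(args: Array[String]): Unit = {
    val props = Seq(
      "'avro.schema.url' = '/tmp/avro/test.avsc'",
      "'transient_lastDdlTime' = '1466638180'")

    // Old form: the opening parenthesis is glued to the first property and the
    // closing parenthesis to the last one.
    println(props.mkString("TBLPROPERTIES (", ",\n  ", ")\n"))

    // New form: every property sits on its own indented line and the closing
    // parenthesis gets a line of its own.
    println(props.mkString("TBLPROPERTIES (\n  ", ",\n  ", "\n)\n"))
  }
}
```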
spark git commit: [SPARK-15230][SQL] distinct() does not handle column name with dot properly
Repository: spark Updated Branches: refs/heads/master 37f3be5d2 -> 925884a61 [SPARK-15230][SQL] distinct() does not handle column name with dot properly ## What changes were proposed in this pull request? When table is created with column name containing dot, distinct() will fail to run. For example, ```scala val rowRDD = sparkContext.parallelize(Seq(Row(1), Row(1), Row(2))) val schema = StructType(Array(StructField("column.with.dot", IntegerType, nullable = false))) val df = spark.createDataFrame(rowRDD, schema) ``` running the following will have no problem: ```scala df.select(new Column("`column.with.dot`")) ``` but running the query with additional distinct() will cause exception: ```scala df.select(new Column("`column.with.dot`")).distinct() ``` The issue is that distinct() will try to resolve the column name, but the column name in the schema does not have backtick with it. So the solution is to add the backtick before passing the column name to resolve(). ## How was this patch tested? Added a new test case. Author: bomeng Closes #13140 from bomeng/SPARK-15230. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/925884a6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/925884a6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/925884a6 Branch: refs/heads/master Commit: 925884a612dd88beaddf555c74d90856ab040ec7 Parents: 37f3be5 Author: bomeng Authored: Thu Jun 23 11:06:19 2016 +0800 Committer: Wenchen Fan Committed: Thu Jun 23 11:06:19 2016 +0800 -- sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 8 +++- .../src/test/scala/org/apache/spark/sql/DataFrameSuite.scala | 5 + 2 files changed, 12 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/925884a6/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 02cc398..f1d33c3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1812,7 +1812,13 @@ class Dataset[T] private[sql]( * @since 2.0.0 */ def dropDuplicates(colNames: Seq[String]): Dataset[T] = withTypedPlan { -val groupCols = colNames.map(resolve) +val resolver = sparkSession.sessionState.analyzer.resolver +val allColumns = queryExecution.analyzed.output +val groupCols = colNames.map { colName => + allColumns.find(col => resolver(col.name, colName)).getOrElse( +throw new AnalysisException( + s"""Cannot resolve column name "$colName" among (${schema.fieldNames.mkString(", ")})""")) +} val groupColExprIds = groupCols.map(_.exprId) val aggCols = logicalPlan.output.map { attr => if (groupColExprIds.contains(attr.exprId)) { http://git-wip-us.apache.org/repos/asf/spark/blob/925884a6/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index c8a0f71..1afee9f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1536,4 +1536,9 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { Utils.deleteRecursively(baseDir) } } + + test("SPARK-15230: distinct() does not handle column name with dot properly") { +val df = Seq(1, 1, 2).toDF("column.with.dot") +checkAnswer(df.distinct(), 
Row(1) :: Row(2) :: Nil) + } }
spark git commit: [SPARK-15230][SQL] distinct() does not handle column name with dot properly
Repository: spark Updated Branches: refs/heads/branch-2.0 e0a43235d -> 5b4a9a4c3 [SPARK-15230][SQL] distinct() does not handle column name with dot properly ## What changes were proposed in this pull request? When table is created with column name containing dot, distinct() will fail to run. For example, ```scala val rowRDD = sparkContext.parallelize(Seq(Row(1), Row(1), Row(2))) val schema = StructType(Array(StructField("column.with.dot", IntegerType, nullable = false))) val df = spark.createDataFrame(rowRDD, schema) ``` running the following will have no problem: ```scala df.select(new Column("`column.with.dot`")) ``` but running the query with additional distinct() will cause exception: ```scala df.select(new Column("`column.with.dot`")).distinct() ``` The issue is that distinct() will try to resolve the column name, but the column name in the schema does not have backtick with it. So the solution is to add the backtick before passing the column name to resolve(). ## How was this patch tested? Added a new test case. Author: bomeng Closes #13140 from bomeng/SPARK-15230. (cherry picked from commit 925884a612dd88beaddf555c74d90856ab040ec7) Signed-off-by: Wenchen Fan Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5b4a9a4c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5b4a9a4c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5b4a9a4c Branch: refs/heads/branch-2.0 Commit: 5b4a9a4c37822cd7528c6bb933da3454fd3bcd37 Parents: e0a4323 Author: bomeng Authored: Thu Jun 23 11:06:19 2016 +0800 Committer: Wenchen Fan Committed: Thu Jun 23 11:06:38 2016 +0800 -- sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala | 8 +++- .../src/test/scala/org/apache/spark/sql/DataFrameSuite.scala | 5 + 2 files changed, 12 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5b4a9a4c/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala index 02cc398..f1d33c3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -1812,7 +1812,13 @@ class Dataset[T] private[sql]( * @since 2.0.0 */ def dropDuplicates(colNames: Seq[String]): Dataset[T] = withTypedPlan { -val groupCols = colNames.map(resolve) +val resolver = sparkSession.sessionState.analyzer.resolver +val allColumns = queryExecution.analyzed.output +val groupCols = colNames.map { colName => + allColumns.find(col => resolver(col.name, colName)).getOrElse( +throw new AnalysisException( + s"""Cannot resolve column name "$colName" among (${schema.fieldNames.mkString(", ")})""")) +} val groupColExprIds = groupCols.map(_.exprId) val aggCols = logicalPlan.output.map { attr => if (groupColExprIds.contains(attr.exprId)) { http://git-wip-us.apache.org/repos/asf/spark/blob/5b4a9a4c/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala index c8a0f71..1afee9f 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala @@ -1536,4 +1536,9 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { Utils.deleteRecursively(baseDir) } } + + test("SPARK-15230: distinct() does not handle column 
name with dot properly") { +val df = Seq(1, 1, 2).toDF("column.with.dot") +checkAnswer(df.distinct(), Row(1) :: Row(2) :: Nil) + } }
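Editor's note: the snippets from the PR description above, assembled into one runnable program (assuming a local SparkSession) for anyone who wants to reproduce the behavior end to end.

```scala
import org.apache.spark.sql.{Column, Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

// With SPARK-15230 applied, the final distinct() returns Row(1) and Row(2)
// instead of failing to resolve "column.with.dot".
object DistinctWithDotColumn {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("SPARK-15230").getOrCreate()

    val rowRDD = spark.sparkContext.parallelize(Seq(Row(1), Row(1), Row(2)))
    val schema = StructType(Array(StructField("column.with.dot", IntegerType, nullable = false)))
    val df = spark.createDataFrame(rowRDD, schema)

    // Selecting the backtick-quoted column has always worked.
    df.select(new Column("`column.with.dot`")).show()

    // Before the fix this threw an AnalysisException while resolving the name.
    df.select(new Column("`column.with.dot`")).distinct().show()

    spark.stop()
  }
}
```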
spark git commit: [SPARK-16159][SQL] Move RDD creation logic from FileSourceStrategy.apply
Repository: spark Updated Branches: refs/heads/master 9f990fa3f -> 37f3be5d2 [SPARK-16159][SQL] Move RDD creation logic from FileSourceStrategy.apply ## What changes were proposed in this pull request? We embed partitioning logic in FileSourceStrategy.apply, making the function very long. This is a small refactoring to move it into its own functions. Eventually we would be able to move the partitioning functions into a physical operator, rather than doing it in physical planning. ## How was this patch tested? This is a simple code move. Author: Reynold Xin Closes #13862 from rxin/SPARK-16159. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/37f3be5d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/37f3be5d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/37f3be5d Branch: refs/heads/master Commit: 37f3be5d29192db0a54f6c4699237b149bd0ecae Parents: 9f990fa Author: Reynold Xin Authored: Wed Jun 22 18:19:07 2016 -0700 Committer: Reynold Xin Committed: Wed Jun 22 18:19:07 2016 -0700 -- .../sql/execution/datasources/FileScanRDD.scala | 26 +- .../datasources/FileSourceStrategy.scala| 240 +++ 2 files changed, 154 insertions(+), 112 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/37f3be5d/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala index f7f68b1..1443057 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileScanRDD.scala @@ -27,9 +27,14 @@ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.vectorized.ColumnarBatch /** - * A single file that should be read, along with partition column values that - * need to be prepended to each row. The reading should start at the first - * valid record found after `start`. + * A part (i.e. "block") of a single file that should be read, along with partition column values + * that need to be prepended to each row. + * + * @param partitionValues value of partition columns to be prepended to each row. + * @param filePath path of the file to read + * @param start the beginning offset (in bytes) of the block. + * @param length number of bytes to read. + * @param locations locality information (list of nodes that have the data). */ case class PartitionedFile( partitionValues: InternalRow, @@ -43,13 +48,14 @@ case class PartitionedFile( } /** - * A collection of files that should be read as a single task possibly from multiple partitioned - * directories. - * - * TODO: This currently does not take locality information about the files into account. + * A collection of file blocks that should be read as a single task + * (possibly from multiple partitioned directories). */ case class FilePartition(index: Int, files: Seq[PartitionedFile]) extends RDDPartition +/** + * An RDD that scans a list of file partitions. 
+ */ class FileScanRDD( @transient private val sparkSession: SparkSession, readFunction: (PartitionedFile) => Iterator[InternalRow], @@ -88,8 +94,8 @@ class FileScanRDD( private[this] var currentFile: PartitionedFile = null private[this] var currentIterator: Iterator[Object] = null - def hasNext = (currentIterator != null && currentIterator.hasNext) || nextIterator() - def next() = { + def hasNext: Boolean = (currentIterator != null && currentIterator.hasNext) || nextIterator() + def next(): Object = { val nextElement = currentIterator.next() // TODO: we should have a better separation of row based and batch based scan, so that we // don't need to run this `if` for every record. @@ -120,7 +126,7 @@ class FileScanRDD( } } - override def close() = { + override def close(): Unit = { updateBytesRead() updateBytesReadWithFileSize() InputFileNameHolder.unsetInputFileName() http://git-wip-us.apache.org/repos/asf/spark/blob/37f3be5d/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceStrategy.scala index 13a86bf..04f166f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/FileSourceSt
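Editor's note: to make the `FilePartition` doc comment above concrete, here is a small self-contained sketch of packing file blocks into read tasks. It is an illustration of the idea only — the names `FileBlock`/`TaskPartition` and the greedy largest-first heuristic are ours, not the exact logic in FileSourceStrategy (which also derives the per-task byte budget from configuration and handles locality).

```scala
// Group file blocks into tasks so each task reads roughly the same number of bytes.
object FilePackingSketch {
  // Stand-ins for Spark's PartitionedFile / FilePartition.
  case class FileBlock(path: String, start: Long, length: Long)
  case class TaskPartition(index: Int, blocks: Seq[FileBlock])

  def pack(blocks: Seq[FileBlock], maxBytesPerTask: Long): Seq[TaskPartition] = {
    val partitions = scala.collection.mutable.ArrayBuffer.empty[TaskPartition]
    val current = scala.collection.mutable.ArrayBuffer.empty[FileBlock]
    var currentSize = 0L

    // Seal the partition being built and start a fresh one.
    def close(): Unit = if (current.nonEmpty) {
      partitions += TaskPartition(partitions.size, current.toList)
      current.clear()
      currentSize = 0L
    }

    // Largest blocks first tends to even out task sizes.
    blocks.sortBy(-_.length).foreach { block =>
      if (currentSize + block.length > maxBytesPerTask) close()
      current += block
      currentSize += block.length
    }
    close()
    partitions.toList
  }

  def main(args: Array[String]): Unit = {
    val blocks = Seq(
      FileBlock("/data/a.parquet", 0, 96L * 1024 * 1024),
      FileBlock("/data/b.parquet", 0, 40L * 1024 * 1024),
      FileBlock("/data/c.parquet", 0, 10L * 1024 * 1024))
    pack(blocks, maxBytesPerTask = 128L * 1024 * 1024).foreach(println)
  }
}
```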
spark git commit: [SPARK-16024][SQL][TEST] Verify Column Comment for Data Source Tables
Repository: spark Updated Branches: refs/heads/branch-2.0 e2eb8e002 -> e0a43235d [SPARK-16024][SQL][TEST] Verify Column Comment for Data Source Tables What changes were proposed in this pull request? This PR is to improve test coverage. It verifies whether `Comment` of `Column` can be appropriate handled. The test cases verify the related parts in Parser, both SQL and DataFrameWriter interface, and both Hive Metastore catalog and In-memory catalog. How was this patch tested? N/A Author: gatorsmile Closes #13764 from gatorsmile/dataSourceComment. (cherry picked from commit 9f990fa3f9e0b798d8018cf4132b93a3468f33bb) Signed-off-by: Wenchen Fan Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e0a43235 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e0a43235 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e0a43235 Branch: refs/heads/branch-2.0 Commit: e0a43235d9d59736ceb0d703c653ef1350e143ec Parents: e2eb8e0 Author: gatorsmile Authored: Thu Jun 23 09:12:20 2016 +0800 Committer: Wenchen Fan Committed: Thu Jun 23 09:13:17 2016 +0800 -- .../spark/sql/execution/command/DDLCommandSuite.scala | 10 +++--- .../apache/spark/sql/execution/command/DDLSuite.scala | 13 + .../spark/sql/hive/execution/HiveDDLSuite.scala | 14 ++ 3 files changed, 34 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e0a43235/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index 5bee28b..7b96f4c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.execution.datasources.{BucketSpec, CreateTableUsing} import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} -import org.apache.spark.sql.types.{IntegerType, StringType, StructType} +import org.apache.spark.sql.types.{IntegerType, MetadataBuilder, StringType, StructType} // TODO: merge this with DDLSuite (SPARK-14441) @@ -349,10 +349,14 @@ class DDLCommandSuite extends PlanTest { } test("create table using - with partitioned by") { -val query = "CREATE TABLE my_tab(a INT, b STRING) USING parquet PARTITIONED BY (a)" +val query = "CREATE TABLE my_tab(a INT comment 'test', b STRING) " + + "USING parquet PARTITIONED BY (a)" val expected = CreateTableUsing( TableIdentifier("my_tab"), - Some(new StructType().add("a", IntegerType).add("b", StringType)), + Some(new StructType() +.add("a", IntegerType, nullable = true, + new MetadataBuilder().putString("comment", s"test").build()) +.add("b", StringType)), "parquet", false, Map.empty, http://git-wip-us.apache.org/repos/asf/spark/blob/e0a43235/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index f40ddcc..47d8a28 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -252,6 +252,19 @@ class DDLSuite 
extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } } + test("desc table for parquet data source table using in-memory catalog") { +assume(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "in-memory") +val tabName = "tab1" +withTable(tabName) { + sql(s"CREATE TABLE $tabName(a int comment 'test') USING parquet ") + + checkAnswer( +sql(s"DESC $tabName").select("col_name", "data_type", "comment"), +Row("a", "int", "test") + ) +} + } + test("Alter/Describe Database") { withTempDir { tmpDir => val path = tmpDir.toString http://git-wip-us.apache.org/repos/asf/spark/blob/e0a43235/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLS
spark git commit: [SPARK-16024][SQL][TEST] Verify Column Comment for Data Source Tables
Repository: spark Updated Branches: refs/heads/master 4f869f88e -> 9f990fa3f [SPARK-16024][SQL][TEST] Verify Column Comment for Data Source Tables What changes were proposed in this pull request? This PR is to improve test coverage. It verifies whether `Comment` of `Column` can be appropriate handled. The test cases verify the related parts in Parser, both SQL and DataFrameWriter interface, and both Hive Metastore catalog and In-memory catalog. How was this patch tested? N/A Author: gatorsmile Closes #13764 from gatorsmile/dataSourceComment. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9f990fa3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9f990fa3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9f990fa3 Branch: refs/heads/master Commit: 9f990fa3f9e0b798d8018cf4132b93a3468f33bb Parents: 4f869f8 Author: gatorsmile Authored: Thu Jun 23 09:12:20 2016 +0800 Committer: Wenchen Fan Committed: Thu Jun 23 09:12:20 2016 +0800 -- .../spark/sql/execution/command/DDLCommandSuite.scala | 10 +++--- .../apache/spark/sql/execution/command/DDLSuite.scala | 13 + .../spark/sql/hive/execution/HiveDDLSuite.scala | 14 ++ 3 files changed, 34 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/9f990fa3/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala index 5bee28b..7b96f4c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLCommandSuite.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.SparkSqlParser import org.apache.spark.sql.execution.datasources.{BucketSpec, CreateTableUsing} import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} -import org.apache.spark.sql.types.{IntegerType, StringType, StructType} +import org.apache.spark.sql.types.{IntegerType, MetadataBuilder, StringType, StructType} // TODO: merge this with DDLSuite (SPARK-14441) @@ -349,10 +349,14 @@ class DDLCommandSuite extends PlanTest { } test("create table using - with partitioned by") { -val query = "CREATE TABLE my_tab(a INT, b STRING) USING parquet PARTITIONED BY (a)" +val query = "CREATE TABLE my_tab(a INT comment 'test', b STRING) " + + "USING parquet PARTITIONED BY (a)" val expected = CreateTableUsing( TableIdentifier("my_tab"), - Some(new StructType().add("a", IntegerType).add("b", StringType)), + Some(new StructType() +.add("a", IntegerType, nullable = true, + new MetadataBuilder().putString("comment", s"test").build()) +.add("b", StringType)), "parquet", false, Map.empty, http://git-wip-us.apache.org/repos/asf/spark/blob/9f990fa3/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala index f40ddcc..47d8a28 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/command/DDLSuite.scala @@ -252,6 +252,19 @@ class DDLSuite extends QueryTest with SharedSQLContext with BeforeAndAfterEach { } } + test("desc table for parquet 
data source table using in-memory catalog") { +assume(spark.sparkContext.conf.get(CATALOG_IMPLEMENTATION) == "in-memory") +val tabName = "tab1" +withTable(tabName) { + sql(s"CREATE TABLE $tabName(a int comment 'test') USING parquet ") + + checkAnswer( +sql(s"DESC $tabName").select("col_name", "data_type", "comment"), +Row("a", "int", "test") + ) +} + } + test("Alter/Describe Database") { withTempDir { tmpDir => val path = tmpDir.toString http://git-wip-us.apache.org/repos/asf/spark/blob/9f990fa3/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala index b2f01fc..89f69c8 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/executio
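Editor's note: a short illustration of what these tests exercise, assuming a local SparkSession with the default in-memory catalog. It shows the SQL path (`COMMENT` in the column definition surfacing through `DESC`) and the schema-metadata form that the `DDLCommandSuite` assertion builds by hand.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, MetadataBuilder, StructType}

object ColumnCommentExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("column-comment").getOrCreate()

    // SQL interface: the comment is attached to the column definition...
    spark.sql("CREATE TABLE tab1(a INT COMMENT 'test') USING parquet")
    // ...and surfaces in DESC output as (col_name, data_type, comment) = (a, int, test).
    spark.sql("DESC tab1").show()

    // In the parsed schema the comment travels as column metadata, which is
    // exactly what the DDLCommandSuite expectation constructs.
    val expected = new StructType()
      .add("a", IntegerType, nullable = true,
        new MetadataBuilder().putString("comment", "test").build())
    println(expected.prettyJson)

    spark.sql("DROP TABLE tab1")
    spark.stop()
  }
}
```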
spark git commit: [SPARK-15956][SQL] When unwrapping ORC avoid pattern matching at runtime
Repository: spark Updated Branches: refs/heads/master 044971eca -> 4f869f88e [SPARK-15956][SQL] When unwrapping ORC avoid pattern matching at runtime ## What changes were proposed in this pull request? Extend the returning of unwrapper functions from primitive types to all types. This PR is based on https://github.com/apache/spark/pull/13676. It only fixes a bug with scala-2.10 compilation. All credit should go to dafrista. ## How was this patch tested? The patch should pass all unit tests. Reading ORC files with non-primitive types with this change reduced the read time by ~15%. Author: Brian Cho Author: Herman van Hovell Closes #13854 from hvanhovell/SPARK-15956-scala210. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4f869f88 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4f869f88 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4f869f88 Branch: refs/heads/master Commit: 4f869f88ee96fa57be79f972f218111b6feac67f Parents: 044971e Author: Brian Cho Authored: Wed Jun 22 16:56:55 2016 -0700 Committer: Herman van Hovell Committed: Wed Jun 22 16:56:55 2016 -0700 -- .../apache/spark/sql/hive/HiveInspectors.scala | 428 +-- .../org/apache/spark/sql/hive/TableReader.scala | 3 +- .../hive/execution/ScriptTransformation.scala | 6 +- .../org/apache/spark/sql/hive/hiveUDFs.scala| 21 +- .../spark/sql/hive/HiveInspectorSuite.scala | 6 + 5 files changed, 314 insertions(+), 150 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4f869f88/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 585befe..bf5cc17 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -239,145 +239,6 @@ private[hive] trait HiveInspectors { } /** - * Converts hive types to native catalyst types. - * @param data the data in Hive type - * @param oi the ObjectInspector associated with the Hive Type - * @return convert the data into catalyst type - * TODO return the function of (data => Any) instead for performance consideration - * - * Strictly follows the following order in unwrapping (constant OI has the higher priority): - * Constant Null object inspector => - *return null - * Constant object inspector => - *extract the value from constant object inspector - * Check whether the `data` is null => - *return null if true - * If object inspector prefers writable => - *extract writable from `data` and then get the catalyst type from the writable - * Extract the java object directly from the object inspector - * - * NOTICE: the complex data type requires recursive unwrapping. 
- */ - def unwrap(data: Any, oi: ObjectInspector): Any = oi match { -case coi: ConstantObjectInspector if coi.getWritableConstantValue == null => null -case poi: WritableConstantStringObjectInspector => - UTF8String.fromString(poi.getWritableConstantValue.toString) -case poi: WritableConstantHiveVarcharObjectInspector => - UTF8String.fromString(poi.getWritableConstantValue.getHiveVarchar.getValue) -case poi: WritableConstantHiveCharObjectInspector => - UTF8String.fromString(poi.getWritableConstantValue.getHiveChar.getValue) -case poi: WritableConstantHiveDecimalObjectInspector => - HiveShim.toCatalystDecimal( -PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector, -poi.getWritableConstantValue.getHiveDecimal) -case poi: WritableConstantTimestampObjectInspector => - val t = poi.getWritableConstantValue - t.getSeconds * 100L + t.getNanos / 1000L -case poi: WritableConstantIntObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantDoubleObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantBooleanObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantLongObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantFloatObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantShortObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantByteObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantBinaryObjectInspector => - val writable = poi.getWritableConstantValue - val temp = new Array[Byte](writable.getLength) -
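Editor's note: the heart of this change is returning an unwrapper *function* per column instead of pattern matching on the ObjectInspector for every value. The self-contained sketch below uses a tiny stand-in ADT (and arbitrary conversions) rather than Hive's ObjectInspector API, so it only illustrates the shape of the optimization.

```scala
object UnwrapperSketch {
  sealed trait Inspector
  case object IntInspector extends Inspector
  case object StringInspector extends Inspector
  case class ListInspector(element: Inspector) extends Inspector

  // Per-value style (what the old unwrap(data, oi) did): the match re-runs per row.
  def unwrap(data: Any, oi: Inspector): Any = oi match {
    case IntInspector     => data.asInstanceOf[Int]
    case StringInspector  => data.asInstanceOf[String].trim
    case ListInspector(e) => data.asInstanceOf[Seq[Any]].map(unwrap(_, e))
  }

  // Per-column style (what the patch generalizes to all types): resolve the
  // match once, return a closure, and apply that closure to each value.
  def unwrapperFor(oi: Inspector): Any => Any = oi match {
    case IntInspector     => data => data.asInstanceOf[Int]
    case StringInspector  => data => data.asInstanceOf[String].trim
    case ListInspector(e) =>
      val elemUnwrapper = unwrapperFor(e)
      data => data.asInstanceOf[Seq[Any]].map(elemUnwrapper)
  }

  def main(args: Array[String]): Unit = {
    val rows: Seq[Any] = Seq(Seq("a ", " b"), Seq("c "))
    val unwrapColumn = unwrapperFor(ListInspector(StringInspector))
    rows.foreach(r => println(unwrapColumn(r)))
  }
}
```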
spark git commit: [SPARK-16131] initialize internal logger lazily in Scala preferred way
Repository: spark Updated Branches: refs/heads/branch-2.0 1d3c56e77 -> e2eb8e002 [SPARK-16131] initialize internal logger lazily in Scala preferred way ## What changes were proposed in this pull request? Initialize logger instance lazily in Scala preferred way ## How was this patch tested? By running `./build/mvn clean test` locally Author: Prajwal Tuladhar Closes #13842 from infynyxx/spark_internal_logger. (cherry picked from commit 044971eca0ff3c2ce62afa665dbd3072d52cbbec) Signed-off-by: Shixiong Zhu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2eb8e00 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2eb8e00 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2eb8e00 Branch: refs/heads/branch-2.0 Commit: e2eb8e002acb19fd266d2237baec31f74aa02ef8 Parents: 1d3c56e Author: Prajwal Tuladhar Authored: Wed Jun 22 16:30:10 2016 -0700 Committer: Shixiong Zhu Committed: Wed Jun 22 16:30:18 2016 -0700 -- .../scala/org/apache/spark/internal/Logging.scala | 14 -- .../cluster/CoarseGrainedSchedulerBackend.scala | 2 -- 2 files changed, 4 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e2eb8e00/core/src/main/scala/org/apache/spark/internal/Logging.scala -- diff --git a/core/src/main/scala/org/apache/spark/internal/Logging.scala b/core/src/main/scala/org/apache/spark/internal/Logging.scala index 66a0cfe..c51050c 100644 --- a/core/src/main/scala/org/apache/spark/internal/Logging.scala +++ b/core/src/main/scala/org/apache/spark/internal/Logging.scala @@ -32,7 +32,10 @@ private[spark] trait Logging { // Make the log field transient so that objects with Logging can // be serialized and used on another machine - @transient private var log_ : Logger = null + @transient lazy val log: Logger = { +initializeLogIfNecessary(false) +LoggerFactory.getLogger(logName) + } // Method to get the logger name for this object protected def logName = { @@ -40,15 +43,6 @@ private[spark] trait Logging { this.getClass.getName.stripSuffix("$") } - // Method to get or create the logger for this object - protected def log: Logger = { -if (log_ == null) { - initializeLogIfNecessary(false) - log_ = LoggerFactory.getLogger(logName) -} -log_ - } - // Log methods that take only a String protected def logInfo(msg: => String) { if (log.isInfoEnabled) log.info(msg) http://git-wip-us.apache.org/repos/asf/spark/blob/e2eb8e00/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 967c4d5..8259923 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -100,8 +100,6 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // instance across threads private val ser = SparkEnv.get.closureSerializer.newInstance() -override protected def log = CoarseGrainedSchedulerBackend.this.log - protected val addressToExecutorId = new HashMap[RpcAddress, String] private val reviveThread = - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16131] initialize internal logger lazily in Scala preferred way
Repository: spark Updated Branches: refs/heads/master 857ecff1d -> 044971eca [SPARK-16131] initialize internal logger lazily in Scala preferred way ## What changes were proposed in this pull request? Initialize logger instance lazily in Scala preferred way ## How was this patch tested? By running `./build/mvn clean test` locally Author: Prajwal Tuladhar Closes #13842 from infynyxx/spark_internal_logger. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/044971ec Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/044971ec Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/044971ec Branch: refs/heads/master Commit: 044971eca0ff3c2ce62afa665dbd3072d52cbbec Parents: 857ecff Author: Prajwal Tuladhar Authored: Wed Jun 22 16:30:10 2016 -0700 Committer: Shixiong Zhu Committed: Wed Jun 22 16:30:10 2016 -0700 -- .../scala/org/apache/spark/internal/Logging.scala | 14 -- .../cluster/CoarseGrainedSchedulerBackend.scala | 2 -- 2 files changed, 4 insertions(+), 12 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/044971ec/core/src/main/scala/org/apache/spark/internal/Logging.scala -- diff --git a/core/src/main/scala/org/apache/spark/internal/Logging.scala b/core/src/main/scala/org/apache/spark/internal/Logging.scala index 66a0cfe..c51050c 100644 --- a/core/src/main/scala/org/apache/spark/internal/Logging.scala +++ b/core/src/main/scala/org/apache/spark/internal/Logging.scala @@ -32,7 +32,10 @@ private[spark] trait Logging { // Make the log field transient so that objects with Logging can // be serialized and used on another machine - @transient private var log_ : Logger = null + @transient lazy val log: Logger = { +initializeLogIfNecessary(false) +LoggerFactory.getLogger(logName) + } // Method to get the logger name for this object protected def logName = { @@ -40,15 +43,6 @@ private[spark] trait Logging { this.getClass.getName.stripSuffix("$") } - // Method to get or create the logger for this object - protected def log: Logger = { -if (log_ == null) { - initializeLogIfNecessary(false) - log_ = LoggerFactory.getLogger(logName) -} -log_ - } - // Log methods that take only a String protected def logInfo(msg: => String) { if (log.isInfoEnabled) log.info(msg) http://git-wip-us.apache.org/repos/asf/spark/blob/044971ec/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala index 967c4d5..8259923 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/cluster/CoarseGrainedSchedulerBackend.scala @@ -100,8 +100,6 @@ class CoarseGrainedSchedulerBackend(scheduler: TaskSchedulerImpl, val rpcEnv: Rp // instance across threads private val ser = SparkEnv.get.closureSerializer.newInstance() -override protected def log = CoarseGrainedSchedulerBackend.this.log - protected val addressToExecutorId = new HashMap[RpcAddress, String] private val reviveThread = - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
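Editor's note: a minimal standalone version of the pattern adopted here, assuming slf4j-api (plus a logging binding) on the classpath. It is not Spark's `Logging` trait, just the same lazy-initialization idea in isolation.

```scala
import org.slf4j.{Logger, LoggerFactory}

// The lazy val replaces the null-check-and-assign getter: its initializer runs
// at most once, and @transient keeps the logger out of serialized closures when
// the trait is mixed into a Serializable class.
trait LazyLogging {
  @transient protected lazy val log: Logger =
    LoggerFactory.getLogger(this.getClass.getName.stripSuffix("$"))

  // By-name parameter: the message string is only built if INFO is enabled.
  protected def logInfo(msg: => String): Unit = {
    if (log.isInfoEnabled) log.info(msg)
  }
}

object LazyLoggingDemo extends LazyLogging {
  def main(args: Array[String]): Unit = logInfo(s"started with ${args.length} args")
}
```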
spark git commit: [SPARK-16155][DOC] remove package grouping in Java docs
Repository: spark Updated Branches: refs/heads/branch-2.0 02435acf3 -> 1d3c56e77 [SPARK-16155][DOC] remove package grouping in Java docs ## What changes were proposed in this pull request? In 1.4 and earlier releases, we have package grouping in the generated Java API docs. See http://spark.apache.org/docs/1.4.0/api/java/index.html. However, this disappeared in 1.5.0: http://spark.apache.org/docs/1.5.0/api/java/index.html. Rather than fixing it, I'd suggest removing grouping. Because it might take some time to fix and it is a manual process to update the grouping in `SparkBuild.scala`. I didn't find anyone complaining about missing groups since 1.5.0 on Google. Manually checked the generated Java API docs and confirmed that they are the same as in master. Author: Xiangrui Meng Closes #13856 from mengxr/SPARK-16155. (cherry picked from commit 857ecff1d8268b28bb287e47cda370c87afe9d41) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d3c56e7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d3c56e7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d3c56e7 Branch: refs/heads/branch-2.0 Commit: 1d3c56e778b28ad4587d07765896814bfc1201f4 Parents: 02435ac Author: Xiangrui Meng Authored: Wed Jun 22 15:52:37 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 15:52:47 2016 -0700 -- project/SparkBuild.scala | 20 1 file changed, 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1d3c56e7/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index bce7f1d..4b44469 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -684,11 +684,6 @@ object Unidoc { import sbtunidoc.Plugin._ import UnidocKeys._ - // for easier specification of JavaDoc package groups - private def packageList(names: String*): String = { -names.map(s => "org.apache.spark." + s).mkString(":") - } - private def ignoreUndocumentedPackages(packages: Seq[Seq[File]]): Seq[Seq[File]] = { packages .map(_.filterNot(_.getName.contains("$"))) @@ -731,21 +726,6 @@ object Unidoc { javacOptions in doc := Seq( "-windowtitle", "Spark " + version.value.replaceAll("-SNAPSHOT", "") + " JavaDoc", "-public", - "-group", "Core Java API", packageList("api.java", "api.java.function"), - "-group", "Spark Streaming", packageList( -"streaming.api.java", "streaming.flume", "streaming.kafka", "streaming.kinesis" - ), - "-group", "MLlib", packageList( -"mllib.classification", "mllib.clustering", "mllib.evaluation.binary", "mllib.linalg", -"mllib.linalg.distributed", "mllib.optimization", "mllib.rdd", "mllib.recommendation", -"mllib.regression", "mllib.stat", "mllib.tree", "mllib.tree.configuration", -"mllib.tree.impurity", "mllib.tree.model", "mllib.util", -"mllib.evaluation", "mllib.feature", "mllib.random", "mllib.stat.correlation", -"mllib.stat.test", "mllib.tree.impl", "mllib.tree.loss", -"ml", "ml.attribute", "ml.classification", "ml.clustering", "ml.evaluation", "ml.feature", -"ml.param", "ml.recommendation", "ml.regression", "ml.tuning" - ), - "-group", "Spark SQL", packageList("sql.api.java", "sql.api.java.types", "sql.hive.api.java"), "-noqualifier", "java.lang" ), - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16155][DOC] remove package grouping in Java docs
Repository: spark Updated Branches: refs/heads/master 00cc5cca4 -> 857ecff1d [SPARK-16155][DOC] remove package grouping in Java docs ## What changes were proposed in this pull request? In 1.4 and earlier releases, we have package grouping in the generated Java API docs. See http://spark.apache.org/docs/1.4.0/api/java/index.html. However, this disappeared in 1.5.0: http://spark.apache.org/docs/1.5.0/api/java/index.html. Rather than fixing it, I'd suggest removing grouping. Because it might take some time to fix and it is a manual process to update the grouping in `SparkBuild.scala`. I didn't find anyone complaining about missing groups since 1.5.0 on Google. Manually checked the generated Java API docs and confirmed that they are the same as in master. Author: Xiangrui Meng Closes #13856 from mengxr/SPARK-16155. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/857ecff1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/857ecff1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/857ecff1 Branch: refs/heads/master Commit: 857ecff1d8268b28bb287e47cda370c87afe9d41 Parents: 00cc5cc Author: Xiangrui Meng Authored: Wed Jun 22 15:52:37 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 15:52:37 2016 -0700 -- project/SparkBuild.scala | 20 1 file changed, 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/857ecff1/project/SparkBuild.scala -- diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index bce7f1d..4b44469 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -684,11 +684,6 @@ object Unidoc { import sbtunidoc.Plugin._ import UnidocKeys._ - // for easier specification of JavaDoc package groups - private def packageList(names: String*): String = { -names.map(s => "org.apache.spark." + s).mkString(":") - } - private def ignoreUndocumentedPackages(packages: Seq[Seq[File]]): Seq[Seq[File]] = { packages .map(_.filterNot(_.getName.contains("$"))) @@ -731,21 +726,6 @@ object Unidoc { javacOptions in doc := Seq( "-windowtitle", "Spark " + version.value.replaceAll("-SNAPSHOT", "") + " JavaDoc", "-public", - "-group", "Core Java API", packageList("api.java", "api.java.function"), - "-group", "Spark Streaming", packageList( -"streaming.api.java", "streaming.flume", "streaming.kafka", "streaming.kinesis" - ), - "-group", "MLlib", packageList( -"mllib.classification", "mllib.clustering", "mllib.evaluation.binary", "mllib.linalg", -"mllib.linalg.distributed", "mllib.optimization", "mllib.rdd", "mllib.recommendation", -"mllib.regression", "mllib.stat", "mllib.tree", "mllib.tree.configuration", -"mllib.tree.impurity", "mllib.tree.model", "mllib.util", -"mllib.evaluation", "mllib.feature", "mllib.random", "mllib.stat.correlation", -"mllib.stat.test", "mllib.tree.impl", "mllib.tree.loss", -"ml", "ml.attribute", "ml.classification", "ml.clustering", "ml.evaluation", "ml.feature", -"ml.param", "ml.recommendation", "ml.regression", "ml.tuning" - ), - "-group", "Spark SQL", packageList("sql.api.java", "sql.api.java.types", "sql.hive.api.java"), "-noqualifier", "java.lang" ), - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16153][MLLIB] switch to multi-line doc to avoid a genjavadoc bug
Repository: spark Updated Branches: refs/heads/branch-2.0 282a3cd02 -> 02435acf3 [SPARK-16153][MLLIB] switch to multi-line doc to avoid a genjavadoc bug ## What changes were proposed in this pull request? We recently deprecated setLabelCol in ChiSqSelectorModel (#13823): ~~~scala /** group setParam */ Since("1.6.0") deprecated("labelCol is not used by ChiSqSelectorModel.", "2.0.0") def setLabelCol(value: String): this.type = set(labelCol, value) ~~~ This unfortunately hit a genjavadoc bug and broken doc generation. This is the generated Java code: ~~~java /** group setParam */ public org.apache.spark.ml.feature.ChiSqSelectorModel setOutputCol (java.lang.String value) { throw new RuntimeException(); } * * deprecated labelCol is not used by ChiSqSelectorModel. Since 2.0.0. */ public org.apache.spark.ml.feature.ChiSqSelectorModel setLabelCol (java.lang.String value) { throw new RuntimeException(); } ~~~ Switching to multiline is a workaround. Author: Xiangrui Meng Closes #13855 from mengxr/SPARK-16153. (cherry picked from commit 00cc5cca4522297b63b1522a2b8643b1a098e2b3) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/02435acf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/02435acf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/02435acf Branch: refs/heads/branch-2.0 Commit: 02435acf3bf84f77bb3c70a2fd548af8bad4c28e Parents: 282a3cd Author: Xiangrui Meng Authored: Wed Jun 22 15:50:21 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 15:50:28 2016 -0700 -- .../main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/02435acf/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 38b4db9..712634d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -140,7 +140,9 @@ final class ChiSqSelectorModel private[ml] ( @Since("1.6.0") def setOutputCol(value: String): this.type = set(outputCol, value) - /** @group setParam */ + /** + * @group setParam + */ @Since("1.6.0") @deprecated("labelCol is not used by ChiSqSelectorModel.", "2.0.0") def setLabelCol(value: String): this.type = set(labelCol, value) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16153][MLLIB] switch to multi-line doc to avoid a genjavadoc bug
Repository: spark Updated Branches: refs/heads/master 20d411bc5 -> 00cc5cca4 [SPARK-16153][MLLIB] switch to multi-line doc to avoid a genjavadoc bug ## What changes were proposed in this pull request? We recently deprecated setLabelCol in ChiSqSelectorModel (#13823): ~~~scala /** group setParam */ Since("1.6.0") deprecated("labelCol is not used by ChiSqSelectorModel.", "2.0.0") def setLabelCol(value: String): this.type = set(labelCol, value) ~~~ This unfortunately hit a genjavadoc bug and broken doc generation. This is the generated Java code: ~~~java /** group setParam */ public org.apache.spark.ml.feature.ChiSqSelectorModel setOutputCol (java.lang.String value) { throw new RuntimeException(); } * * deprecated labelCol is not used by ChiSqSelectorModel. Since 2.0.0. */ public org.apache.spark.ml.feature.ChiSqSelectorModel setLabelCol (java.lang.String value) { throw new RuntimeException(); } ~~~ Switching to multiline is a workaround. Author: Xiangrui Meng Closes #13855 from mengxr/SPARK-16153. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/00cc5cca Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/00cc5cca Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/00cc5cca Branch: refs/heads/master Commit: 00cc5cca4522297b63b1522a2b8643b1a098e2b3 Parents: 20d411b Author: Xiangrui Meng Authored: Wed Jun 22 15:50:21 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 15:50:21 2016 -0700 -- .../main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/00cc5cca/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala index 38b4db9..712634d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/ChiSqSelector.scala @@ -140,7 +140,9 @@ final class ChiSqSelectorModel private[ml] ( @Since("1.6.0") def setOutputCol(value: String): this.type = set(outputCol, value) - /** @group setParam */ + /** + * @group setParam + */ @Since("1.6.0") @deprecated("labelCol is not used by ChiSqSelectorModel.", "2.0.0") def setLabelCol(value: String): this.type = set(labelCol, value) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
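Editor's note: a minimal shape of the workaround. Spark's `@Since` annotation is left out so the snippet compiles on its own; the point is only that the Scaladoc on the deprecated, annotated setter spans multiple lines, which keeps genjavadoc from emitting a malformed Java comment.

```scala
// Illustrative class, not Spark's ChiSqSelectorModel.
class SelectorModelLike {
  private var labelCol: String = "label"

  /**
   * @group setParam
   */
  @deprecated("labelCol is not used by this model.", "2.0.0")
  def setLabelCol(value: String): this.type = { labelCol = value; this }
}
```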
spark git commit: [SPARK-16078][SQL] from_utc_timestamp/to_utc_timestamp should not depends on local timezone
Repository: spark Updated Branches: refs/heads/branch-2.0 299f427b7 -> 282a3cd02 [SPARK-16078][SQL] from_utc_timestamp/to_utc_timestamp should not depends on local timezone ## What changes were proposed in this pull request? Currently, we use local timezone to parse or format a timestamp (TimestampType), then use Long as the microseconds since epoch UTC. In from_utc_timestamp() and to_utc_timestamp(), we did not consider the local timezone, they could return different results with different local timezone. This PR will do the conversion based on human time (in local timezone), it should return same result in whatever timezone. But because the mapping from absolute timestamp to human time is not exactly one-to-one mapping, it will still return wrong result in some timezone (also in the begging or ending of DST). This PR is kind of the best effort fix. In long term, we should make the TimestampType be timezone aware to fix this totally. ## How was this patch tested? Tested these function in all timezone. Author: Davies Liu Closes #13784 from davies/convert_tz. (cherry picked from commit 20d411bc5d05dd099f6d5234a24e10a519a39bdf) Signed-off-by: Herman van Hovell Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/282a3cd0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/282a3cd0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/282a3cd0 Branch: refs/heads/branch-2.0 Commit: 282a3cd02389464d6adbf02921281c963da29b00 Parents: 299f427 Author: Davies Liu Authored: Wed Jun 22 13:40:24 2016 -0700 Committer: Herman van Hovell Committed: Wed Jun 22 13:41:33 2016 -0700 -- .../expressions/datetimeExpressions.scala | 10 +-- .../spark/sql/catalyst/util/DateTimeUtils.scala | 34 -- .../sql/catalyst/util/DateTimeUtilsSuite.scala | 65 3 files changed, 73 insertions(+), 36 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/282a3cd0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 773431d..04c17bd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -730,16 +730,17 @@ case class FromUTCTimestamp(left: Expression, right: Expression) """.stripMargin) } else { val tzTerm = ctx.freshName("tz") +val utcTerm = ctx.freshName("utc") val tzClass = classOf[TimeZone].getName ctx.addMutableState(tzClass, tzTerm, s"""$tzTerm = $tzClass.getTimeZone("$tz");""") +ctx.addMutableState(tzClass, utcTerm, s"""$utcTerm = $tzClass.getTimeZone("UTC");""") val eval = left.genCode(ctx) ev.copy(code = s""" |${eval.code} |boolean ${ev.isNull} = ${eval.isNull}; |long ${ev.value} = 0; |if (!${ev.isNull}) { - | ${ev.value} = ${eval.value} + - | ${tzTerm}.getOffset(${eval.value} / 1000) * 1000L; + | ${ev.value} = $dtu.convertTz(${eval.value}, $utcTerm, $tzTerm); |} """.stripMargin) } @@ -869,16 +870,17 @@ case class ToUTCTimestamp(left: Expression, right: Expression) """.stripMargin) } else { val tzTerm = ctx.freshName("tz") +val utcTerm = ctx.freshName("utc") val tzClass = classOf[TimeZone].getName ctx.addMutableState(tzClass, tzTerm, s"""$tzTerm = $tzClass.getTimeZone("$tz");""") +ctx.addMutableState(tzClass, utcTerm, s"""$utcTerm = 
$tzClass.getTimeZone("UTC");""") val eval = left.genCode(ctx) ev.copy(code = s""" |${eval.code} |boolean ${ev.isNull} = ${eval.isNull}; |long ${ev.value} = 0; |if (!${ev.isNull}) { - | ${ev.value} = ${eval.value} - - | ${tzTerm}.getOffset(${eval.value} / 1000) * 1000L; + | ${ev.value} = $dtu.convertTz(${eval.value}, $tzTerm, $utcTerm); |} """.stripMargin) } http://git-wip-us.apache.org/repos/asf/spark/blob/282a3cd0/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 56bf9a7..df480a
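A minimal sketch of the wall-clock based conversion described above, assuming only java.util.TimeZone; this is not the DateTimeUtils.convertTz code the patch adds, and the helper name is made up. It reads the instant's wall clock in the source timezone and re-anchors that reading in the target timezone, so the result does not depend on the JVM's default timezone (and, as the description concedes, is only approximate around DST transitions).

```scala
import java.util.TimeZone

object ConvertTzSketch {
  // Illustrative only: shift a microseconds-since-epoch timestamp so that its
  // wall-clock reading in `from` is preserved when re-read in `to`.
  def convertTz(micros: Long, from: TimeZone, to: TimeZone): Long = {
    val millis = micros / 1000L
    val microsFraction = micros - millis * 1000L
    // Wall-clock millis of the instant as seen in the source timezone.
    val wallClockMillis = millis + from.getOffset(millis)
    // Re-anchor the same wall-clock reading in the target timezone.
    val resultMillis = wallClockMillis - to.getOffset(wallClockMillis)
    resultMillis * 1000L + microsFraction
  }

  def main(args: Array[String]): Unit = {
    val utc = TimeZone.getTimeZone("UTC")
    val la = TimeZone.getTimeZone("America/Los_Angeles")
    val micros = 1466625600L * 1000000L // 2016-06-22 20:00:00 UTC
    println(convertTz(micros, utc, la)) // same answer under any default timezone
  }
}
```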
spark git commit: [SPARK-16078][SQL] from_utc_timestamp/to_utc_timestamp should not depend on local timezone
Repository: spark Updated Branches: refs/heads/master 43b04b7ec -> 20d411bc5 [SPARK-16078][SQL] from_utc_timestamp/to_utc_timestamp should not depends on local timezone ## What changes were proposed in this pull request? Currently, we use local timezone to parse or format a timestamp (TimestampType), then use Long as the microseconds since epoch UTC. In from_utc_timestamp() and to_utc_timestamp(), we did not consider the local timezone, they could return different results with different local timezone. This PR will do the conversion based on human time (in local timezone), it should return same result in whatever timezone. But because the mapping from absolute timestamp to human time is not exactly one-to-one mapping, it will still return wrong result in some timezone (also in the begging or ending of DST). This PR is kind of the best effort fix. In long term, we should make the TimestampType be timezone aware to fix this totally. ## How was this patch tested? Tested these function in all timezone. Author: Davies Liu Closes #13784 from davies/convert_tz. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/20d411bc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/20d411bc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/20d411bc Branch: refs/heads/master Commit: 20d411bc5d05dd099f6d5234a24e10a519a39bdf Parents: 43b04b7 Author: Davies Liu Authored: Wed Jun 22 13:40:24 2016 -0700 Committer: Herman van Hovell Committed: Wed Jun 22 13:40:24 2016 -0700 -- .../expressions/datetimeExpressions.scala | 10 +-- .../spark/sql/catalyst/util/DateTimeUtils.scala | 34 -- .../sql/catalyst/util/DateTimeUtilsSuite.scala | 65 3 files changed, 73 insertions(+), 36 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/20d411bc/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala index 773431d..04c17bd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/datetimeExpressions.scala @@ -730,16 +730,17 @@ case class FromUTCTimestamp(left: Expression, right: Expression) """.stripMargin) } else { val tzTerm = ctx.freshName("tz") +val utcTerm = ctx.freshName("utc") val tzClass = classOf[TimeZone].getName ctx.addMutableState(tzClass, tzTerm, s"""$tzTerm = $tzClass.getTimeZone("$tz");""") +ctx.addMutableState(tzClass, utcTerm, s"""$utcTerm = $tzClass.getTimeZone("UTC");""") val eval = left.genCode(ctx) ev.copy(code = s""" |${eval.code} |boolean ${ev.isNull} = ${eval.isNull}; |long ${ev.value} = 0; |if (!${ev.isNull}) { - | ${ev.value} = ${eval.value} + - | ${tzTerm}.getOffset(${eval.value} / 1000) * 1000L; + | ${ev.value} = $dtu.convertTz(${eval.value}, $utcTerm, $tzTerm); |} """.stripMargin) } @@ -869,16 +870,17 @@ case class ToUTCTimestamp(left: Expression, right: Expression) """.stripMargin) } else { val tzTerm = ctx.freshName("tz") +val utcTerm = ctx.freshName("utc") val tzClass = classOf[TimeZone].getName ctx.addMutableState(tzClass, tzTerm, s"""$tzTerm = $tzClass.getTimeZone("$tz");""") +ctx.addMutableState(tzClass, utcTerm, s"""$utcTerm = $tzClass.getTimeZone("UTC");""") val eval = left.genCode(ctx) ev.copy(code = s""" |${eval.code} |boolean 
${ev.isNull} = ${eval.isNull}; |long ${ev.value} = 0; |if (!${ev.isNull}) { - | ${ev.value} = ${eval.value} - - | ${tzTerm}.getOffset(${eval.value} / 1000) * 1000L; + | ${ev.value} = $dtu.convertTz(${eval.value}, $tzTerm, $utcTerm); |} """.stripMargin) } http://git-wip-us.apache.org/repos/asf/spark/blob/20d411bc/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 56bf9a7..df480a1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/ca
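For context, a small usage sketch of the two SQL functions touched by this patch; the session setup and literal values are illustrative and not part of the commit. After the fix, the query is expected to return the same rows regardless of the driver JVM's default timezone.

```scala
import org.apache.spark.sql.SparkSession

object FromToUtcTimestampDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("tz-demo").getOrCreate()
    // from_utc_timestamp: treat the input as UTC and render it in the given zone.
    // to_utc_timestamp: treat the input as the given zone and render it in UTC.
    spark.sql(
      """SELECT
        |  from_utc_timestamp('2016-06-22 20:00:00', 'America/Los_Angeles') AS la_time,
        |  to_utc_timestamp('2016-06-22 13:00:00', 'America/Los_Angeles') AS utc_time
      """.stripMargin
    ).show(false)
    spark.stop()
  }
}
```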
spark git commit: [SPARK-15672][R][DOC] R programming guide update
Repository: spark Updated Branches: refs/heads/branch-2.0 e043c02d0 -> 299f427b7 [SPARK-15672][R][DOC] R programming guide update ## What changes were proposed in this pull request? Guide for - UDFs with dapply, dapplyCollect - spark.lapply for running parallel R functions ## How was this patch tested? build locally https://cloud.githubusercontent.com/assets/3419881/16039344/12a3b6a0-31de-11e6-8d77-fe23308075c0.png";> Author: Kai Jiang Closes #13660 from vectorijk/spark-15672-R-guide-update. (cherry picked from commit 43b04b7ecb313a2cee6121dd575de1f7dc785c11) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/299f427b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/299f427b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/299f427b Branch: refs/heads/branch-2.0 Commit: 299f427b70f8dedbc0b554f83c4fde408caf4d15 Parents: e043c02 Author: Kai Jiang Authored: Wed Jun 22 12:50:36 2016 -0700 Committer: Joseph K. Bradley Committed: Wed Jun 22 12:50:44 2016 -0700 -- R/pkg/R/context.R | 2 +- docs/sparkr.md| 77 ++ 2 files changed, 78 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/299f427b/R/pkg/R/context.R -- diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 96ef943..dd0ceae 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -246,7 +246,7 @@ setCheckpointDir <- function(sc, dirName) { #' \preformatted{ #' train <- function(hyperparam) { #' library(MASS) -#' lm.ridge(ây ~ x+zâ, data, lambda=hyperparam) +#' lm.ridge("y ~ x+z", data, lambda=hyperparam) #' model #' } #' } http://git-wip-us.apache.org/repos/asf/spark/blob/299f427b/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index f018901..9e74e4a 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -255,6 +255,83 @@ head(df) {% endhighlight %} +### Applying User-Defined Function +In SparkR, we support several kinds of User-Defined Functions: + + Run a given function on a large dataset using `dapply` or `dapplyCollect` + +# dapply +Apply a function to each partition of a `SparkDataFrame`. The function to be applied to each partition of the `SparkDataFrame` +and should have only one parameter, to which a `data.frame` corresponds to each partition will be passed. The output of function +should be a `data.frame`. Schema specifies the row format of the resulting a `SparkDataFrame`. It must match the R function's output. + +{% highlight r %} + +# Convert waiting time from hours to seconds. +# Note that we can apply UDF to DataFrame. +schema <- structType(structField("eruptions", "double"), structField("waiting", "double"), + structField("waiting_secs", "double")) +df1 <- dapply(df, function(x) {x <- cbind(x, x$waiting * 60)}, schema) +head(collect(df1)) +## eruptions waiting waiting_secs +##1 3.600 79 4740 +##2 1.800 54 3240 +##3 3.333 74 4440 +##4 2.283 62 3720 +##5 4.533 85 5100 +##6 2.883 55 3300 +{% endhighlight %} + + +# dapplyCollect +Like `dapply`, apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of function +should be a `data.frame`. But, Schema is not required to be passed. Note that `dapplyCollect` only can be used if the +output of UDF run on all the partitions can fit in driver memory. + +{% highlight r %} + +# Convert waiting time from hours to seconds. 
+# Note that we can apply UDF to DataFrame and return a R's data.frame +ldf <- dapplyCollect( + df, + function(x) { + x <- cbind(x, "waiting_secs" = x$waiting * 60) + }) +head(ldf, 3) +## eruptions waiting waiting_secs +##1 3.600 79 4740 +##2 1.800 54 3240 +##3 3.333 74 4440 + +{% endhighlight %} + + + Run local R functions distributed using `spark.lapply` + +# spark.lapply +Similar to `lapply` in native R, `spark.lapply` runs a function over a list of elements and distributes the computations with Spark. +Applies a function in a manner that is similar to `doParallel` or `lapply` to elements of a list. The results of all the computations +should fit in a single machine. If that is not the case they can do something like `df <- createDataFrame(list)` and then use +`dapply` + +{% highlight r %} + +# Perform distributed training of multiple models with spark.lapply. Here, we pass +# a read-only list of arguments which specifies family the generalized linear model shou
spark git commit: [SPARK-15672][R][DOC] R programming guide update
Repository: spark Updated Branches: refs/heads/master 6f915c9ec -> 43b04b7ec [SPARK-15672][R][DOC] R programming guide update ## What changes were proposed in this pull request? Guide for - UDFs with dapply, dapplyCollect - spark.lapply for running parallel R functions ## How was this patch tested? build locally https://cloud.githubusercontent.com/assets/3419881/16039344/12a3b6a0-31de-11e6-8d77-fe23308075c0.png";> Author: Kai Jiang Closes #13660 from vectorijk/spark-15672-R-guide-update. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/43b04b7e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/43b04b7e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/43b04b7e Branch: refs/heads/master Commit: 43b04b7ecb313a2cee6121dd575de1f7dc785c11 Parents: 6f915c9 Author: Kai Jiang Authored: Wed Jun 22 12:50:36 2016 -0700 Committer: Joseph K. Bradley Committed: Wed Jun 22 12:50:36 2016 -0700 -- R/pkg/R/context.R | 2 +- docs/sparkr.md| 77 ++ 2 files changed, 78 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/43b04b7e/R/pkg/R/context.R -- diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index 96ef943..dd0ceae 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -246,7 +246,7 @@ setCheckpointDir <- function(sc, dirName) { #' \preformatted{ #' train <- function(hyperparam) { #' library(MASS) -#' lm.ridge(ây ~ x+zâ, data, lambda=hyperparam) +#' lm.ridge("y ~ x+z", data, lambda=hyperparam) #' model #' } #' } http://git-wip-us.apache.org/repos/asf/spark/blob/43b04b7e/docs/sparkr.md -- diff --git a/docs/sparkr.md b/docs/sparkr.md index f018901..9e74e4a 100644 --- a/docs/sparkr.md +++ b/docs/sparkr.md @@ -255,6 +255,83 @@ head(df) {% endhighlight %} +### Applying User-Defined Function +In SparkR, we support several kinds of User-Defined Functions: + + Run a given function on a large dataset using `dapply` or `dapplyCollect` + +# dapply +Apply a function to each partition of a `SparkDataFrame`. The function to be applied to each partition of the `SparkDataFrame` +and should have only one parameter, to which a `data.frame` corresponds to each partition will be passed. The output of function +should be a `data.frame`. Schema specifies the row format of the resulting a `SparkDataFrame`. It must match the R function's output. + +{% highlight r %} + +# Convert waiting time from hours to seconds. +# Note that we can apply UDF to DataFrame. +schema <- structType(structField("eruptions", "double"), structField("waiting", "double"), + structField("waiting_secs", "double")) +df1 <- dapply(df, function(x) {x <- cbind(x, x$waiting * 60)}, schema) +head(collect(df1)) +## eruptions waiting waiting_secs +##1 3.600 79 4740 +##2 1.800 54 3240 +##3 3.333 74 4440 +##4 2.283 62 3720 +##5 4.533 85 5100 +##6 2.883 55 3300 +{% endhighlight %} + + +# dapplyCollect +Like `dapply`, apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of function +should be a `data.frame`. But, Schema is not required to be passed. Note that `dapplyCollect` only can be used if the +output of UDF run on all the partitions can fit in driver memory. + +{% highlight r %} + +# Convert waiting time from hours to seconds. 
+# Note that we can apply UDF to DataFrame and return a R's data.frame +ldf <- dapplyCollect( + df, + function(x) { + x <- cbind(x, "waiting_secs" = x$waiting * 60) + }) +head(ldf, 3) +## eruptions waiting waiting_secs +##1 3.600 79 4740 +##2 1.800 54 3240 +##3 3.333 74 4440 + +{% endhighlight %} + + + Run local R functions distributed using `spark.lapply` + +# spark.lapply +Similar to `lapply` in native R, `spark.lapply` runs a function over a list of elements and distributes the computations with Spark. +Applies a function in a manner that is similar to `doParallel` or `lapply` to elements of a list. The results of all the computations +should fit in a single machine. If that is not the case they can do something like `df <- createDataFrame(list)` and then use +`dapply` + +{% highlight r %} + +# Perform distributed training of multiple models with spark.lapply. Here, we pass +# a read-only list of arguments which specifies family the generalized linear model should be. +families <- c("gaussian", "poisson") +train <- function(family) { + model <- glm(Sepal.Length ~ Sepal.W
spark git commit: [SPARK-16003] SerializationDebugger runs into infinite loop
Repository: spark Updated Branches: refs/heads/master 472d611a7 -> 6f915c9ec [SPARK-16003] SerializationDebugger runs into infinite loop ## What changes were proposed in this pull request? This fixes SerializationDebugger to not recurse forever when `writeReplace` returns an object of the same class, which is the case for at least the `SQLMetrics` class. See also the OpenJDK unit tests on the behavior of recursive `writeReplace()`: https://github.com/openjdk-mirror/jdk7u-jdk/blob/f4d80957e89a19a29bb9f9807d2a28351ed7f7df/test/java/io/Serializable/nestedReplace/NestedReplace.java cc davies cloud-fan ## How was this patch tested? Unit tests for SerializationDebugger. Author: Eric Liang Closes #13814 from ericl/spark-16003. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f915c9e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f915c9e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f915c9e Branch: refs/heads/master Commit: 6f915c9ec24003877d1ef675a59145699780a2ff Parents: 472d611 Author: Eric Liang Authored: Wed Jun 22 12:12:34 2016 -0700 Committer: Davies Liu Committed: Wed Jun 22 12:12:34 2016 -0700 -- .../spark/serializer/SerializationDebugger.scala | 9 - .../spark/serializer/SerializationDebuggerSuite.scala | 13 - 2 files changed, 16 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6f915c9e/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala -- diff --git a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala index c04b483..5e7a98c 100644 --- a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala +++ b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala @@ -155,7 +155,7 @@ private[spark] object SerializationDebugger extends Logging { // If the object has been replaced using writeReplace(), // then call visit() on it again to test its type again. - if (!finalObj.eq(o)) { + if (finalObj.getClass != o.getClass) { return visit(finalObj, s"writeReplace data (class: ${finalObj.getClass.getName})" :: stack) } @@ -265,11 +265,10 @@ private[spark] object SerializationDebugger extends Logging { if (!desc.hasWriteReplaceMethod) { (o, desc) } else { - // write place val replaced = desc.invokeWriteReplace(o) - // `writeReplace` may return the same object. - if (replaced eq o) { -(o, desc) + // `writeReplace` recursion stops when the returned object has the same class. 
+ if (replaced.getClass == o.getClass) { +(replaced, desc) } else { findObjectAndDescriptor(replaced) } http://git-wip-us.apache.org/repos/asf/spark/blob/6f915c9e/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala index f019b1e..912a516 100644 --- a/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala @@ -126,7 +126,11 @@ class SerializationDebuggerSuite extends SparkFunSuite with BeforeAndAfterEach { assert(find(new SerializableClassWithWriteReplace(new SerializableClass1)).isEmpty) } -test("object containing writeObject() and not serializable field") { + test("no infinite loop with writeReplace() which returns class of its own type") { +assert(find(new SerializableClassWithRecursiveWriteReplace).isEmpty) + } + + test("object containing writeObject() and not serializable field") { val s = find(new SerializableClassWithWriteObject(new NotSerializable)) assert(s.size === 3) assert(s(0).contains("NotSerializable")) @@ -229,6 +233,13 @@ class SerializableClassWithWriteReplace(@(transient @param) replacementFieldObje } +class SerializableClassWithRecursiveWriteReplace extends Serializable { + private def writeReplace(): Object = { +new SerializableClassWithRecursiveWriteReplace + } +} + + class ExternalizableClass(objectField: Object) extends java.io.Externalizable { val serializableObjectField = new SerializableClass1 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
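The new stop condition mirrors what java.io.ObjectOutputStream itself does: it keeps substituting through `writeReplace` only while the returned object has a different class, as the linked NestedReplace test demonstrates. Below is a minimal sketch of the pattern that previously looped forever in the debugger; the class name is made up and this is not the `SQLMetrics` code.

```scala
import java.io.{ByteArrayOutputStream, ObjectOutputStream}

// A class whose writeReplace returns a fresh instance of its own class.
class SelfReplacing extends Serializable {
  private def writeReplace(): Object = new SelfReplacing
}

object WriteReplaceDemo {
  def main(args: Array[String]): Unit = {
    val out = new ObjectOutputStream(new ByteArrayOutputStream())
    // Plain Java serialization terminates here because substitution stops once
    // the replacement has the same class; the debugger now applies the same rule.
    out.writeObject(new SelfReplacing)
    out.close()
  }
}
```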
spark git commit: [SPARK-16003] SerializationDebugger runs into infinite loop
Repository: spark Updated Branches: refs/heads/branch-2.0 520828c90 -> e043c02d0 [SPARK-16003] SerializationDebugger runs into infinite loop ## What changes were proposed in this pull request? This fixes SerializationDebugger to not recurse forever when `writeReplace` returns an object of the same class, which is the case for at least the `SQLMetrics` class. See also the OpenJDK unit tests on the behavior of recursive `writeReplace()`: https://github.com/openjdk-mirror/jdk7u-jdk/blob/f4d80957e89a19a29bb9f9807d2a28351ed7f7df/test/java/io/Serializable/nestedReplace/NestedReplace.java cc davies cloud-fan ## How was this patch tested? Unit tests for SerializationDebugger. Author: Eric Liang Closes #13814 from ericl/spark-16003. (cherry picked from commit 6f915c9ec24003877d1ef675a59145699780a2ff) Signed-off-by: Davies Liu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e043c02d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e043c02d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e043c02d Branch: refs/heads/branch-2.0 Commit: e043c02d039809be149622a4d7562f332cfa25aa Parents: 520828c Author: Eric Liang Authored: Wed Jun 22 12:12:34 2016 -0700 Committer: Davies Liu Committed: Wed Jun 22 12:12:44 2016 -0700 -- .../spark/serializer/SerializationDebugger.scala | 9 - .../spark/serializer/SerializationDebuggerSuite.scala | 13 - 2 files changed, 16 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e043c02d/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala -- diff --git a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala index c04b483..5e7a98c 100644 --- a/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala +++ b/core/src/main/scala/org/apache/spark/serializer/SerializationDebugger.scala @@ -155,7 +155,7 @@ private[spark] object SerializationDebugger extends Logging { // If the object has been replaced using writeReplace(), // then call visit() on it again to test its type again. - if (!finalObj.eq(o)) { + if (finalObj.getClass != o.getClass) { return visit(finalObj, s"writeReplace data (class: ${finalObj.getClass.getName})" :: stack) } @@ -265,11 +265,10 @@ private[spark] object SerializationDebugger extends Logging { if (!desc.hasWriteReplaceMethod) { (o, desc) } else { - // write place val replaced = desc.invokeWriteReplace(o) - // `writeReplace` may return the same object. - if (replaced eq o) { -(o, desc) + // `writeReplace` recursion stops when the returned object has the same class. 
+ if (replaced.getClass == o.getClass) { +(replaced, desc) } else { findObjectAndDescriptor(replaced) } http://git-wip-us.apache.org/repos/asf/spark/blob/e043c02d/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala b/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala index f019b1e..912a516 100644 --- a/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala +++ b/core/src/test/scala/org/apache/spark/serializer/SerializationDebuggerSuite.scala @@ -126,7 +126,11 @@ class SerializationDebuggerSuite extends SparkFunSuite with BeforeAndAfterEach { assert(find(new SerializableClassWithWriteReplace(new SerializableClass1)).isEmpty) } -test("object containing writeObject() and not serializable field") { + test("no infinite loop with writeReplace() which returns class of its own type") { +assert(find(new SerializableClassWithRecursiveWriteReplace).isEmpty) + } + + test("object containing writeObject() and not serializable field") { val s = find(new SerializableClassWithWriteObject(new NotSerializable)) assert(s.size === 3) assert(s(0).contains("NotSerializable")) @@ -229,6 +233,13 @@ class SerializableClassWithWriteReplace(@(transient @param) replacementFieldObje } +class SerializableClassWithRecursiveWriteReplace extends Serializable { + private def writeReplace(): Object = { +new SerializableClassWithRecursiveWriteReplace + } +} + + class ExternalizableClass(objectField: Object) extends java.io.Externalizable { val serializableObjectField = new SerializableClass1 - To unsubscribe, e-mail: commits-unsubscr
spark git commit: [SPARK-15956][SQL] Revert "[] When unwrapping ORC avoid pattern matching…
Repository: spark Updated Branches: refs/heads/master c2cebdb7d -> 472d611a7 [SPARK-15956][SQL] Revert "[] When unwrapping ORC avoid pattern matching⦠This reverts commit 0a9c02759515c41de37db6381750bc3a316c860c. It breaks the 2.10 build, I'll fix this in a different PR. Author: Herman van Hovell Closes #13853 from hvanhovell/SPARK-15956-revert. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/472d611a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/472d611a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/472d611a Branch: refs/heads/master Commit: 472d611a70da02d95e36da754435a3ac562f8b24 Parents: c2cebdb Author: Herman van Hovell Authored: Wed Jun 22 11:36:32 2016 -0700 Committer: Herman van Hovell Committed: Wed Jun 22 11:36:32 2016 -0700 -- .../apache/spark/sql/hive/HiveInspectors.scala | 428 ++- .../org/apache/spark/sql/hive/TableReader.scala | 3 +- .../hive/execution/ScriptTransformation.scala | 6 +- .../org/apache/spark/sql/hive/hiveUDFs.scala| 21 +- .../spark/sql/hive/HiveInspectorSuite.scala | 6 - 5 files changed, 150 insertions(+), 314 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/472d611a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 1aadc8b..585befe 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -239,6 +239,145 @@ private[hive] trait HiveInspectors { } /** + * Converts hive types to native catalyst types. + * @param data the data in Hive type + * @param oi the ObjectInspector associated with the Hive Type + * @return convert the data into catalyst type + * TODO return the function of (data => Any) instead for performance consideration + * + * Strictly follows the following order in unwrapping (constant OI has the higher priority): + * Constant Null object inspector => + *return null + * Constant object inspector => + *extract the value from constant object inspector + * Check whether the `data` is null => + *return null if true + * If object inspector prefers writable => + *extract writable from `data` and then get the catalyst type from the writable + * Extract the java object directly from the object inspector + * + * NOTICE: the complex data type requires recursive unwrapping. 
+ */ + def unwrap(data: Any, oi: ObjectInspector): Any = oi match { +case coi: ConstantObjectInspector if coi.getWritableConstantValue == null => null +case poi: WritableConstantStringObjectInspector => + UTF8String.fromString(poi.getWritableConstantValue.toString) +case poi: WritableConstantHiveVarcharObjectInspector => + UTF8String.fromString(poi.getWritableConstantValue.getHiveVarchar.getValue) +case poi: WritableConstantHiveCharObjectInspector => + UTF8String.fromString(poi.getWritableConstantValue.getHiveChar.getValue) +case poi: WritableConstantHiveDecimalObjectInspector => + HiveShim.toCatalystDecimal( +PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector, +poi.getWritableConstantValue.getHiveDecimal) +case poi: WritableConstantTimestampObjectInspector => + val t = poi.getWritableConstantValue + t.getSeconds * 100L + t.getNanos / 1000L +case poi: WritableConstantIntObjectInspector => + poi.getWritableConstantValue.get() +case poi: WritableConstantDoubleObjectInspector => + poi.getWritableConstantValue.get() +case poi: WritableConstantBooleanObjectInspector => + poi.getWritableConstantValue.get() +case poi: WritableConstantLongObjectInspector => + poi.getWritableConstantValue.get() +case poi: WritableConstantFloatObjectInspector => + poi.getWritableConstantValue.get() +case poi: WritableConstantShortObjectInspector => + poi.getWritableConstantValue.get() +case poi: WritableConstantByteObjectInspector => + poi.getWritableConstantValue.get() +case poi: WritableConstantBinaryObjectInspector => + val writable = poi.getWritableConstantValue + val temp = new Array[Byte](writable.getLength) + System.arraycopy(writable.getBytes, 0, temp, 0, temp.length) + temp +case poi: WritableConstantDateObjectInspector => + DateTimeUtils.fromJavaDate(poi.getWritableConstantValue.get()) +case mi: StandardConstantMapObjectInspector => + // take the value from the map inspector object, rather than the input
spark git commit: [SPARK-16120][STREAMING] getCurrentLogFiles in ReceiverSuite WAL generating and cleaning case uses external variable instead of the passed parameter
Repository: spark Updated Branches: refs/heads/branch-2.0 76d0ef34e -> 520828c90 [SPARK-16120][STREAMING] getCurrentLogFiles in ReceiverSuite WAL generating and cleaning case uses external variable instead of the passed parameter ## What changes were proposed in this pull request? In `ReceiverSuite.scala`, in the test case "write ahead log - generating and cleaning", the inner method `getCurrentLogFiles` uses external variable `logDirectory1` instead of the passed parameter `logDirectory`. This PR fixes this by using the passed method argument instead of variable from the outer scope. ## How was this patch tested? The unit test was re-run and the output logs were checked for the correct paths used. tdas Author: Ahmed Mahran Closes #13825 from ahmed-mahran/b-receiver-suite-wal-gen-cln. (cherry picked from commit c2cebdb7ddff3d041d548fe1cd8de4efb31b294f) Signed-off-by: Shixiong Zhu Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/520828c9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/520828c9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/520828c9 Branch: refs/heads/branch-2.0 Commit: 520828c90d25acf733ffa70fe269dcfe93b56a31 Parents: 76d0ef3 Author: Ahmed Mahran Authored: Wed Jun 22 10:39:24 2016 -0700 Committer: Shixiong Zhu Committed: Wed Jun 22 10:39:38 2016 -0700 -- .../src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/520828c9/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala index 917232c..1b1e21f 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala @@ -215,7 +215,7 @@ class ReceiverSuite extends TestSuiteBase with Timeouts with Serializable { def getCurrentLogFiles(logDirectory: File): Seq[String] = { try { if (logDirectory.exists()) { - logDirectory1.listFiles().filter { _.getName.startsWith("log") }.map { _.toString } + logDirectory.listFiles().filter { _.getName.startsWith("log") }.map { _.toString } } else { Seq.empty } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16120][STREAMING] getCurrentLogFiles in ReceiverSuite WAL generating and cleaning case uses external variable instead of the passed parameter
Repository: spark Updated Branches: refs/heads/master 0a9c02759 -> c2cebdb7d [SPARK-16120][STREAMING] getCurrentLogFiles in ReceiverSuite WAL generating and cleaning case uses external variable instead of the passed parameter ## What changes were proposed in this pull request? In `ReceiverSuite.scala`, in the test case "write ahead log - generating and cleaning", the inner method `getCurrentLogFiles` uses external variable `logDirectory1` instead of the passed parameter `logDirectory`. This PR fixes this by using the passed method argument instead of variable from the outer scope. ## How was this patch tested? The unit test was re-run and the output logs were checked for the correct paths used. tdas Author: Ahmed Mahran Closes #13825 from ahmed-mahran/b-receiver-suite-wal-gen-cln. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c2cebdb7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c2cebdb7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c2cebdb7 Branch: refs/heads/master Commit: c2cebdb7ddff3d041d548fe1cd8de4efb31b294f Parents: 0a9c027 Author: Ahmed Mahran Authored: Wed Jun 22 10:39:24 2016 -0700 Committer: Shixiong Zhu Committed: Wed Jun 22 10:39:24 2016 -0700 -- .../src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c2cebdb7/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala -- diff --git a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala index 917232c..1b1e21f 100644 --- a/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala +++ b/streaming/src/test/scala/org/apache/spark/streaming/ReceiverSuite.scala @@ -215,7 +215,7 @@ class ReceiverSuite extends TestSuiteBase with Timeouts with Serializable { def getCurrentLogFiles(logDirectory: File): Seq[String] = { try { if (logDirectory.exists()) { - logDirectory1.listFiles().filter { _.getName.startsWith("log") }.map { _.toString } + logDirectory.listFiles().filter { _.getName.startsWith("log") }.map { _.toString } } else { Seq.empty } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
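The underlying pitfall is that an inner helper can silently close over an outer value instead of its own parameter and still compile. A small illustrative sketch, with made-up names outside the test suite:

```scala
import java.io.File

object ShadowedParamSketch {
  val logDirectory1 = new File("/tmp/wal-logs") // outer value with a similar name

  def getCurrentLogFiles(logDirectory: File): Seq[String] = {
    // Before the fix, the test's helper referenced logDirectory1 here, so the
    // parameter was ignored and every call listed the same outer directory.
    if (logDirectory.exists()) {
      logDirectory.listFiles().filter(_.getName.startsWith("log")).map(_.toString).toSeq
    } else {
      Seq.empty
    }
  }
}
```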
spark git commit: [SPARK-15956][SQL] When unwrapping ORC avoid pattern matching at runtime
Repository: spark Updated Branches: refs/heads/master 6a6010f00 -> 0a9c02759 [SPARK-15956][SQL] When unwrapping ORC avoid pattern matching at runtime ## What changes were proposed in this pull request? Extend the returning of unwrapper functions from primitive types to all types. ## How was this patch tested? The patch should pass all unit tests. Reading ORC files with non-primitive types with this change reduced the read time by ~15%. === The github diff is very noisy. Attaching the screenshots below for improved readability: ![screen shot 2016-06-14 at 5 33 16 pm](https://cloud.githubusercontent.com/assets/1514239/16064580/4d6f7a98-3257-11e6-9172-65e4baff948b.png) ![screen shot 2016-06-14 at 5 33 28 pm](https://cloud.githubusercontent.com/assets/1514239/16064587/5ae6c244-3257-11e6-8460-69eee70de219.png) Author: Brian Cho Closes #13676 from dafrista/improve-orc-master. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0a9c0275 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0a9c0275 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0a9c0275 Branch: refs/heads/master Commit: 0a9c02759515c41de37db6381750bc3a316c860c Parents: 6a6010f Author: Brian Cho Authored: Wed Jun 22 10:38:42 2016 -0700 Committer: Herman van Hovell Committed: Wed Jun 22 10:38:42 2016 -0700 -- .../apache/spark/sql/hive/HiveInspectors.scala | 428 +-- .../org/apache/spark/sql/hive/TableReader.scala | 3 +- .../hive/execution/ScriptTransformation.scala | 6 +- .../org/apache/spark/sql/hive/hiveUDFs.scala| 21 +- .../spark/sql/hive/HiveInspectorSuite.scala | 6 + 5 files changed, 314 insertions(+), 150 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0a9c0275/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala index 585befe..1aadc8b 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveInspectors.scala @@ -239,145 +239,6 @@ private[hive] trait HiveInspectors { } /** - * Converts hive types to native catalyst types. - * @param data the data in Hive type - * @param oi the ObjectInspector associated with the Hive Type - * @return convert the data into catalyst type - * TODO return the function of (data => Any) instead for performance consideration - * - * Strictly follows the following order in unwrapping (constant OI has the higher priority): - * Constant Null object inspector => - *return null - * Constant object inspector => - *extract the value from constant object inspector - * Check whether the `data` is null => - *return null if true - * If object inspector prefers writable => - *extract writable from `data` and then get the catalyst type from the writable - * Extract the java object directly from the object inspector - * - * NOTICE: the complex data type requires recursive unwrapping. 
- */ - def unwrap(data: Any, oi: ObjectInspector): Any = oi match { -case coi: ConstantObjectInspector if coi.getWritableConstantValue == null => null -case poi: WritableConstantStringObjectInspector => - UTF8String.fromString(poi.getWritableConstantValue.toString) -case poi: WritableConstantHiveVarcharObjectInspector => - UTF8String.fromString(poi.getWritableConstantValue.getHiveVarchar.getValue) -case poi: WritableConstantHiveCharObjectInspector => - UTF8String.fromString(poi.getWritableConstantValue.getHiveChar.getValue) -case poi: WritableConstantHiveDecimalObjectInspector => - HiveShim.toCatalystDecimal( -PrimitiveObjectInspectorFactory.javaHiveDecimalObjectInspector, -poi.getWritableConstantValue.getHiveDecimal) -case poi: WritableConstantTimestampObjectInspector => - val t = poi.getWritableConstantValue - t.getSeconds * 100L + t.getNanos / 1000L -case poi: WritableConstantIntObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantDoubleObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantBooleanObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantLongObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantFloatObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantShortObjectInspector => - poi.getWritableConstantValue.get() -case poi: WritableConstantByteObjectInspector => -
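The stated gain (~15% faster reads for non-primitive types) comes from examining the ObjectInspector once per column instead of once per value. A hedged sketch of the difference, using made-up inspector types rather than the real Hive ObjectInspector hierarchy:

```scala
sealed trait Inspector
case object IntInspector extends Inspector
case object StringInspector extends Inspector

object UnwrapperSketch {
  // Matched per value: the inspector is re-examined for every row.
  def unwrap(data: Any, oi: Inspector): Any = oi match {
    case IntInspector    => data.asInstanceOf[Int]
    case StringInspector => data.asInstanceOf[String]
  }

  // Matched once: callers cache the returned function and apply it per value.
  def unwrapperFor(oi: Inspector): Any => Any = oi match {
    case IntInspector    => (data: Any) => data.asInstanceOf[Int]
    case StringInspector => (data: Any) => data.asInstanceOf[String]
  }

  def main(args: Array[String]): Unit = {
    val unwrapInt = unwrapperFor(IntInspector)
    println(Seq[Any](1, 2, 3).map(unwrapInt)) // List(1, 2, 3)
  }
}
```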
spark git commit: [MINOR][MLLIB] DefaultParamsReadable/Writable should be DeveloperApi
Repository: spark Updated Branches: refs/heads/master 18faa588c -> 6a6010f00 [MINOR][MLLIB] DefaultParamsReadable/Writable should be DeveloperApi ## What changes were proposed in this pull request? `DefaultParamsReadable/Writable` are not user-facing. Only developers who implement `Transformer/Estimator` would use it. So this PR changes the annotation to `DeveloperApi`. Author: Xiangrui Meng Closes #13828 from mengxr/default-readable-should-be-developer-api. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6a6010f0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6a6010f0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6a6010f0 Branch: refs/heads/master Commit: 6a6010f0015542dc2753b2cb12fdd1204db63ea6 Parents: 18faa58 Author: Xiangrui Meng Authored: Wed Jun 22 10:06:43 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 10:06:43 2016 -0700 -- .../scala/org/apache/spark/ml/util/ReadWrite.scala | 13 + 1 file changed, 5 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/6a6010f0/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala index 1582a73..4413fef 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala @@ -26,7 +26,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml._ import org.apache.spark.ml.classification.{OneVsRest, OneVsRestModel} @@ -161,7 +161,7 @@ trait MLWritable { } /** - * :: Experimental :: + * :: DeveloperApi :: * * Helper trait for making simple [[Params]] types writable. If a [[Params]] class stores * all data as [[org.apache.spark.ml.param.Param]] values, then extending this trait will provide @@ -171,8 +171,7 @@ trait MLWritable { * * @see [[DefaultParamsReadable]], the counterpart to this trait */ -@Experimental -@Since("2.0.0") +@DeveloperApi trait DefaultParamsWritable extends MLWritable { self: Params => override def write: MLWriter = new DefaultParamsWriter(this) @@ -230,7 +229,7 @@ trait MLReadable[T] { /** - * :: Experimental :: + * :: DeveloperApi :: * * Helper trait for making simple [[Params]] types readable. If a [[Params]] class stores * all data as [[org.apache.spark.ml.param.Param]] values, then extending this trait will provide @@ -239,11 +238,9 @@ trait MLReadable[T] { * [[org.apache.spark.sql.Dataset]]. * * @tparam T ML instance type - * * @see [[DefaultParamsWritable]], the counterpart to this trait */ -@Experimental -@Since("2.0.0") +@DeveloperApi trait DefaultParamsReadable[T] extends MLReadable[T] { override def read: MLReader[T] = new DefaultParamsReader[T] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
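Since these traits target developers writing their own Transformer/Estimator, a short sketch of that audience's usage may help; the class below is made up, keeps all of its state in Params, and therefore only needs the default read/write implementations.

```scala
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.StructType

// A do-nothing transformer: all persisted state lives in Param values.
class NoOpTransformer(override val uid: String)
  extends Transformer with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("noop"))

  override def transform(dataset: Dataset[_]): DataFrame = dataset.toDF()
  override def transformSchema(schema: StructType): StructType = schema
  override def copy(extra: ParamMap): NoOpTransformer = defaultCopy(extra)
}

// The companion object gets load()/read from DefaultParamsReadable.
object NoOpTransformer extends DefaultParamsReadable[NoOpTransformer]
```

With this in place, `new NoOpTransformer().write.save(dir)` and `NoOpTransformer.load(dir)` round-trip the instance without any custom persistence code.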
spark git commit: [MINOR][MLLIB] DefaultParamsReadable/Writable should be DeveloperApi
Repository: spark Updated Branches: refs/heads/branch-2.0 0cde3ad6d -> 76d0ef34e [MINOR][MLLIB] DefaultParamsReadable/Writable should be DeveloperApi ## What changes were proposed in this pull request? `DefaultParamsReadable/Writable` are not user-facing. Only developers who implement `Transformer/Estimator` would use it. So this PR changes the annotation to `DeveloperApi`. Author: Xiangrui Meng Closes #13828 from mengxr/default-readable-should-be-developer-api. (cherry picked from commit 6a6010f0015542dc2753b2cb12fdd1204db63ea6) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/76d0ef34 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/76d0ef34 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/76d0ef34 Branch: refs/heads/branch-2.0 Commit: 76d0ef34e4a5b91b883141f839adc493205fa429 Parents: 0cde3ad Author: Xiangrui Meng Authored: Wed Jun 22 10:06:43 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 10:06:49 2016 -0700 -- .../scala/org/apache/spark/ml/util/ReadWrite.scala | 13 + 1 file changed, 5 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/76d0ef34/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala index 1582a73..4413fef 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/util/ReadWrite.scala @@ -26,7 +26,7 @@ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext -import org.apache.spark.annotation.{Experimental, Since} +import org.apache.spark.annotation.{DeveloperApi, Experimental, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml._ import org.apache.spark.ml.classification.{OneVsRest, OneVsRestModel} @@ -161,7 +161,7 @@ trait MLWritable { } /** - * :: Experimental :: + * :: DeveloperApi :: * * Helper trait for making simple [[Params]] types writable. If a [[Params]] class stores * all data as [[org.apache.spark.ml.param.Param]] values, then extending this trait will provide @@ -171,8 +171,7 @@ trait MLWritable { * * @see [[DefaultParamsReadable]], the counterpart to this trait */ -@Experimental -@Since("2.0.0") +@DeveloperApi trait DefaultParamsWritable extends MLWritable { self: Params => override def write: MLWriter = new DefaultParamsWriter(this) @@ -230,7 +229,7 @@ trait MLReadable[T] { /** - * :: Experimental :: + * :: DeveloperApi :: * * Helper trait for making simple [[Params]] types readable. If a [[Params]] class stores * all data as [[org.apache.spark.ml.param.Param]] values, then extending this trait will provide @@ -239,11 +238,9 @@ trait MLReadable[T] { * [[org.apache.spark.sql.Dataset]]. * * @tparam T ML instance type - * * @see [[DefaultParamsWritable]], the counterpart to this trait */ -@Experimental -@Since("2.0.0") +@DeveloperApi trait DefaultParamsReadable[T] extends MLReadable[T] { override def read: MLReader[T] = new DefaultParamsReader[T] - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-16127][ML][PYSPARK] Audit @Since annotations related to ml.linalg
Repository: spark Updated Branches: refs/heads/branch-2.0 1cfdd25fd -> 0cde3ad6d [SPARK-16127][ML][PYPSARK] Audit @Since annotations related to ml.linalg [SPARK-14615](https://issues.apache.org/jira/browse/SPARK-14615) and #12627 changed `spark.ml` pipelines to use the new `ml.linalg` classes for `Vector`/`Matrix`. Some `Since` annotations for public methods/vals have not been updated accordingly to be `2.0.0`. This PR updates them. ## How was this patch tested? Existing unit tests. Author: Nick Pentreath Closes #13840 from MLnick/SPARK-16127-ml-linalg-since. (cherry picked from commit 18faa588ca11190890d2eb569d7497fbb25eee5c) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0cde3ad6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0cde3ad6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0cde3ad6 Branch: refs/heads/branch-2.0 Commit: 0cde3ad6d8ac822b73f42b8158ba09f7be00a2c9 Parents: 1cfdd25 Author: Nick Pentreath Authored: Wed Jun 22 10:05:25 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 10:05:31 2016 -0700 -- .../spark/ml/classification/LogisticRegression.scala| 2 +- .../classification/MultilayerPerceptronClassifier.scala | 2 +- .../org/apache/spark/ml/classification/NaiveBayes.scala | 4 ++-- .../scala/org/apache/spark/ml/clustering/KMeans.scala | 2 +- .../main/scala/org/apache/spark/ml/clustering/LDA.scala | 4 ++-- .../apache/spark/ml/feature/ElementwiseProduct.scala| 6 +++--- .../scala/org/apache/spark/ml/feature/Normalizer.scala | 12 ++-- .../apache/spark/ml/feature/PolynomialExpansion.scala | 12 ++-- .../scala/org/apache/spark/ml/feature/Word2Vec.scala| 2 +- .../spark/ml/regression/AFTSurvivalRegression.scala | 6 +++--- .../apache/spark/ml/regression/IsotonicRegression.scala | 4 ++-- .../apache/spark/ml/regression/LinearRegression.scala | 6 +++--- python/pyspark/ml/classification.py | 8 python/pyspark/ml/regression.py | 8 ++-- 14 files changed, 41 insertions(+), 37 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0cde3ad6/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 2fa8fbc..be69d46 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -482,7 +482,7 @@ object LogisticRegression extends DefaultParamsReadable[LogisticRegression] { @Experimental class LogisticRegressionModel private[spark] ( @Since("1.4.0") override val uid: String, -@Since("1.6.0") val coefficients: Vector, +@Since("2.0.0") val coefficients: Vector, @Since("1.3.0") val intercept: Double) extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel] with LogisticRegressionParams with MLWritable { http://git-wip-us.apache.org/repos/asf/spark/blob/0cde3ad6/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 7005421..76ef32a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -296,7 +296,7 @@ object MultilayerPerceptronClassifier class MultilayerPerceptronClassificationModel private[ml] ( @Since("1.5.0") override val uid: String, @Since("1.5.0") val layers: Array[Int], -@Since("1.5.0") val weights: Vector) +@Since("2.0.0") val weights: Vector) extends PredictionModel[Vector, MultilayerPerceptronClassificationModel] with Serializable with MLWritable { http://git-wip-us.apache.org/repos/asf/spark/blob/0cde3ad6/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index a9d4930..7c34031 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classif
spark git commit: [SPARK-16127][ML][PYSPARK] Audit @Since annotations related to ml.linalg
Repository: spark Updated Branches: refs/heads/master ea3a12b01 -> 18faa588c [SPARK-16127][ML][PYPSARK] Audit @Since annotations related to ml.linalg [SPARK-14615](https://issues.apache.org/jira/browse/SPARK-14615) and #12627 changed `spark.ml` pipelines to use the new `ml.linalg` classes for `Vector`/`Matrix`. Some `Since` annotations for public methods/vals have not been updated accordingly to be `2.0.0`. This PR updates them. ## How was this patch tested? Existing unit tests. Author: Nick Pentreath Closes #13840 from MLnick/SPARK-16127-ml-linalg-since. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/18faa588 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/18faa588 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/18faa588 Branch: refs/heads/master Commit: 18faa588ca11190890d2eb569d7497fbb25eee5c Parents: ea3a12b Author: Nick Pentreath Authored: Wed Jun 22 10:05:25 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 10:05:25 2016 -0700 -- .../spark/ml/classification/LogisticRegression.scala| 2 +- .../classification/MultilayerPerceptronClassifier.scala | 2 +- .../org/apache/spark/ml/classification/NaiveBayes.scala | 4 ++-- .../scala/org/apache/spark/ml/clustering/KMeans.scala | 2 +- .../main/scala/org/apache/spark/ml/clustering/LDA.scala | 4 ++-- .../apache/spark/ml/feature/ElementwiseProduct.scala| 6 +++--- .../scala/org/apache/spark/ml/feature/Normalizer.scala | 12 ++-- .../apache/spark/ml/feature/PolynomialExpansion.scala | 12 ++-- .../scala/org/apache/spark/ml/feature/Word2Vec.scala| 2 +- .../spark/ml/regression/AFTSurvivalRegression.scala | 6 +++--- .../apache/spark/ml/regression/IsotonicRegression.scala | 4 ++-- .../apache/spark/ml/regression/LinearRegression.scala | 6 +++--- python/pyspark/ml/classification.py | 8 python/pyspark/ml/regression.py | 8 ++-- 14 files changed, 41 insertions(+), 37 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/18faa588/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 2fa8fbc..be69d46 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -482,7 +482,7 @@ object LogisticRegression extends DefaultParamsReadable[LogisticRegression] { @Experimental class LogisticRegressionModel private[spark] ( @Since("1.4.0") override val uid: String, -@Since("1.6.0") val coefficients: Vector, +@Since("2.0.0") val coefficients: Vector, @Since("1.3.0") val intercept: Double) extends ProbabilisticClassificationModel[Vector, LogisticRegressionModel] with LogisticRegressionParams with MLWritable { http://git-wip-us.apache.org/repos/asf/spark/blob/18faa588/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala index 7005421..76ef32a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala @@ -296,7 +296,7 @@ object MultilayerPerceptronClassifier 
class MultilayerPerceptronClassificationModel private[ml] ( @Since("1.5.0") override val uid: String, @Since("1.5.0") val layers: Array[Int], -@Since("1.5.0") val weights: Vector) +@Since("2.0.0") val weights: Vector) extends PredictionModel[Vector, MultilayerPerceptronClassificationModel] with Serializable with MLWritable { http://git-wip-us.apache.org/repos/asf/spark/blob/18faa588/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index a9d4930..7c34031 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -130,8 +130,8 @@ object NaiveBayes extends DefaultParamsReadable[NaiveBayes] {
spark git commit: [SPARK-16107][R] group glm methods in documentation
Repository: spark Updated Branches: refs/heads/branch-2.0 503eb882c -> 1cfdd25fd [SPARK-16107][R] group glm methods in documentation ## What changes were proposed in this pull request? This groups GLM methods (spark.glm, summary, print, predict and write.ml) in the documentation. The example code was updated. ## How was this patch tested? N/A (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) ![screen shot 2016-06-21 at 2 31 37 pm](https://cloud.githubusercontent.com/assets/15318264/16247077/f6eafc04-37bc-11e6-89a8-7898ff3e4078.png) ![screen shot 2016-06-21 at 2 31 45 pm](https://cloud.githubusercontent.com/assets/15318264/16247078/f6eb1c16-37bc-11e6-940a-2b595b10617c.png) Author: Junyang Qian Author: Junyang Qian Closes #13820 from junyangq/SPARK-16107. (cherry picked from commit ea3a12b0147821960f8dabdc58d726f07f1f0e52) Signed-off-by: Xiangrui Meng Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1cfdd25f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1cfdd25f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1cfdd25f Branch: refs/heads/branch-2.0 Commit: 1cfdd25fdb87012187b1e01f9c5ac4b6218dc840 Parents: 503eb88 Author: Junyang Qian Authored: Wed Jun 22 09:13:08 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 09:13:15 2016 -0700 -- R/pkg/R/mllib.R | 80 +++- 1 file changed, 36 insertions(+), 44 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1cfdd25f/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index b83b3b3..dbff1b9 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -53,9 +53,10 @@ setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj")) #' @note KMeansModel since 2.0.0 setClass("KMeansModel", representation(jobj = "jobj")) -#' Fits a generalized linear model +#' Generalized Linear Models #' -#' Fits a generalized linear model against a Spark DataFrame. +#' Fits generalized linear model against a Spark DataFrame. Users can print, make predictions on the +#' produced model and save the model to the input path. #' #' @param data SparkDataFrame for training. #' @param formula A symbolic description of the model to be fitted. Currently only a few formula @@ -66,8 +67,9 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' @param tol Positive convergence tolerance of iterations. #' @param maxIter Integer giving the maximal number of IRLS iterations. 
-#' @return a fitted generalized linear model +#' @return \code{spark.glm} returns a fitted generalized linear model #' @rdname spark.glm +#' @name spark.glm #' @export #' @examples #' \dontrun{ @@ -76,8 +78,21 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' df <- createDataFrame(iris) #' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian") #' summary(model) +#' +#' # fitted values on training data +#' fitted <- predict(model, df) +#' head(select(fitted, "Sepal_Length", "prediction")) +#' +#' # save fitted model to input path +#' path <- "path/to/model" +#' write.ml(model, path) +#' +#' # can also read back the saved model and print +#' savedModel <- read.ml(path) +#' summary(savedModel) #' } #' @note spark.glm since 2.0.0 +#' @seealso \link{glm}, \link{read.ml} setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25) { if (is.character(family)) { @@ -99,10 +114,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), return(new("GeneralizedLinearRegressionModel", jobj = jobj)) }) -#' Fits a generalized linear model (R-compliant). +#' Generalized Linear Models (R-compliant) #' #' Fits a generalized linear model, similarly to R's glm(). -#' #' @param formula A symbolic description of the model to be fitted. Currently only a few formula #'operators are supported, including '~', '.', ':', '+', and '-'. #' @param data SparkDataFrame for training. @@ -112,7 +126,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' @param epsilon Positive convergence tolerance of iterations. #' @param maxit Integer giving the maximal number of IRLS iterations. -#' @return a fitted generalized linear model +#' @return \code{glm} returns a fitted generalized linear model. #' @rdname glm #' @export #' @examples @@ -124,24 +138,21 @@ setMethod("spark.g
spark git commit: [SPARK-16107][R] group glm methods in documentation
Repository: spark Updated Branches: refs/heads/master cf1995a97 -> ea3a12b01 [SPARK-16107][R] group glm methods in documentation ## What changes were proposed in this pull request? This groups GLM methods (spark.glm, summary, print, predict and write.ml) in the documentation. The example code was updated. ## How was this patch tested? N/A (If this patch involves UI changes, please attach a screenshot; otherwise, remove this) ![screen shot 2016-06-21 at 2 31 37 pm](https://cloud.githubusercontent.com/assets/15318264/16247077/f6eafc04-37bc-11e6-89a8-7898ff3e4078.png) ![screen shot 2016-06-21 at 2 31 45 pm](https://cloud.githubusercontent.com/assets/15318264/16247078/f6eb1c16-37bc-11e6-940a-2b595b10617c.png) Author: Junyang Qian Author: Junyang Qian Closes #13820 from junyangq/SPARK-16107. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ea3a12b0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ea3a12b0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ea3a12b0 Branch: refs/heads/master Commit: ea3a12b0147821960f8dabdc58d726f07f1f0e52 Parents: cf1995a Author: Junyang Qian Authored: Wed Jun 22 09:13:08 2016 -0700 Committer: Xiangrui Meng Committed: Wed Jun 22 09:13:08 2016 -0700 -- R/pkg/R/mllib.R | 80 +++- 1 file changed, 36 insertions(+), 44 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ea3a12b0/R/pkg/R/mllib.R -- diff --git a/R/pkg/R/mllib.R b/R/pkg/R/mllib.R index b83b3b3..dbff1b9 100644 --- a/R/pkg/R/mllib.R +++ b/R/pkg/R/mllib.R @@ -53,9 +53,10 @@ setClass("AFTSurvivalRegressionModel", representation(jobj = "jobj")) #' @note KMeansModel since 2.0.0 setClass("KMeansModel", representation(jobj = "jobj")) -#' Fits a generalized linear model +#' Generalized Linear Models #' -#' Fits a generalized linear model against a Spark DataFrame. +#' Fits generalized linear model against a Spark DataFrame. Users can print, make predictions on the +#' produced model and save the model to the input path. #' #' @param data SparkDataFrame for training. #' @param formula A symbolic description of the model to be fitted. Currently only a few formula @@ -66,8 +67,9 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' @param tol Positive convergence tolerance of iterations. #' @param maxIter Integer giving the maximal number of IRLS iterations. 
-#' @return a fitted generalized linear model +#' @return \code{spark.glm} returns a fitted generalized linear model #' @rdname spark.glm +#' @name spark.glm #' @export #' @examples #' \dontrun{ @@ -76,8 +78,21 @@ setClass("KMeansModel", representation(jobj = "jobj")) #' df <- createDataFrame(iris) #' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian") #' summary(model) +#' +#' # fitted values on training data +#' fitted <- predict(model, df) +#' head(select(fitted, "Sepal_Length", "prediction")) +#' +#' # save fitted model to input path +#' path <- "path/to/model" +#' write.ml(model, path) +#' +#' # can also read back the saved model and print +#' savedModel <- read.ml(path) +#' summary(savedModel) #' } #' @note spark.glm since 2.0.0 +#' @seealso \link{glm}, \link{read.ml} setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25) { if (is.character(family)) { @@ -99,10 +114,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), return(new("GeneralizedLinearRegressionModel", jobj = jobj)) }) -#' Fits a generalized linear model (R-compliant). +#' Generalized Linear Models (R-compliant) #' #' Fits a generalized linear model, similarly to R's glm(). -#' #' @param formula A symbolic description of the model to be fitted. Currently only a few formula #'operators are supported, including '~', '.', ':', '+', and '-'. #' @param data SparkDataFrame for training. @@ -112,7 +126,7 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' \url{https://stat.ethz.ch/R-manual/R-devel/library/stats/html/family.html}. #' @param epsilon Positive convergence tolerance of iterations. #' @param maxit Integer giving the maximal number of IRLS iterations. -#' @return a fitted generalized linear model +#' @return \code{glm} returns a fitted generalized linear model. #' @rdname glm #' @export #' @examples @@ -124,24 +138,21 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"), #' summary(model) #' } #' @note glm since
spark git commit: [SPARK-15783][CORE] Fix Flakiness in BlacklistIntegrationSuite
Repository: spark Updated Branches: refs/heads/master 01277d4b2 -> cf1995a97 [SPARK-15783][CORE] Fix Flakiness in BlacklistIntegrationSuite ## What changes were proposed in this pull request? Five changes here -- the first two were causing failures with BlacklistIntegrationSuite. 1. The testing framework didn't include the reviveOffers thread, so the test which involved delay scheduling might never submit offers late enough for the delay scheduling to kick in. So the periodic revive offers were added in, just like in the real scheduler. 2. `assertEmptyDataStructures` would occasionally fail, because it appeared there was still an active job. This is because in DAGScheduler, the jobWaiter is notified of the job completion before the data structures are cleaned up. Most of the time the test code that is waiting on the jobWaiter won't become active until after the data structures are cleared, but occasionally the race goes the other way, and the assertions fail. 3. `DAGSchedulerSuite` was not stopping all the inner parts it was setting up, so each test was leaking a number of threads. So we stop those parts too. 4. It turns out that `assertMapOutputAvailable` is not terribly useful in this framework -- most of the places I was trying to use it suffer from some race. 5. When there is an exception in the backend, try to improve the error message a little bit. Previously the exception was printed to the console, but the test would fail with a timeout, and the logs wouldn't show anything. ## How was this patch tested? I ran all the tests in `BlacklistIntegrationSuite` 5k times and everything in `DAGSchedulerSuite` 1k times on my laptop. I also ran a full Jenkins build with `BlacklistIntegrationSuite` 500 times and `DAGSchedulerSuite` 50 times, see https://github.com/apache/spark/pull/13548. (I tried more times but Jenkins timed out.) To check for more leaked threads, I added some code to dump the list of all threads at the end of each test in DAGSchedulerSuite, which is how I discovered the mapOutputTracker and eventLoop were leaking threads. (I removed that code from the final PR; it was just part of the testing.) And I'll run Jenkins on this a couple of times to do one more check. Author: Imran Rashid Closes #13565 from squito/blacklist_extra_tests.
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cf1995a9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cf1995a9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cf1995a9 Branch: refs/heads/master Commit: cf1995a97645f0b44c997f4fdbba631fd6b91a16 Parents: 01277d4 Author: Imran Rashid Authored: Wed Jun 22 08:35:41 2016 -0500 Committer: Imran Rashid Committed: Wed Jun 22 08:35:41 2016 -0500 -- .../apache/spark/scheduler/DAGScheduler.scala | 4 +- .../scheduler/BlacklistIntegrationSuite.scala | 16 ++--- .../spark/scheduler/DAGSchedulerSuite.scala | 9 ++- .../scheduler/SchedulerIntegrationSuite.scala | 73 4 files changed, 76 insertions(+), 26 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cf1995a9/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala -- diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index d4e0d6d..4eb7c81 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1465,8 +1465,10 @@ class DAGScheduler( } if (ableToCancelStages) { - job.listener.jobFailed(error) + // SPARK-15783 important to cleanup state first, just for tests where we have some asserts + // against the state. Otherwise we have a *little* bit of flakiness in the tests. cleanupStateForJobAndIndependentStages(job) + job.listener.jobFailed(error) listenerBus.post(SparkListenerJobEnd(job.jobId, clock.getTimeMillis(), JobFailed(error))) } } http://git-wip-us.apache.org/repos/asf/spark/blob/cf1995a9/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala -- diff --git a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala index d8a4b19..8ba2697 100644 --- a/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala +++ b/core/src/test/scala/org/apache/spark/scheduler/BlacklistIntegrationSuite.scala @@ -24,6 +24,7 @@ import org.apache.spark._ class BlacklistIntegrationSuite extends SchedulerIntegrationSuite[MultiExecutorMockBackend]{ val badHost =
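To make the race in item 2 above concrete, here is a minimal, self-contained Scala sketch. It is not Spark code and all names are invented; it only illustrates why waking a waiter before clearing bookkeeping lets an assertion observe stale state, and why the reordering in DAGScheduler.scala shown above removes the flakiness.

```scala
import java.util.concurrent.CountDownLatch
import scala.collection.mutable

object NotifyBeforeCleanupDemo {
  val activeJobs = mutable.Set("job-0")   // stand-in for the scheduler's bookkeeping
  val jobDone = new CountDownLatch(1)     // stand-in for the jobWaiter

  // Old ordering: wake the waiter first, clean up afterwards.
  def failJobBuggy(): Unit = {
    jobDone.countDown()
    Thread.sleep(10)                      // cleanup races with the asserting thread
    activeJobs.clear()
  }

  // Fixed ordering: clean up state first, then wake the waiter.
  def failJobFixed(): Unit = {
    activeJobs.clear()
    jobDone.countDown()
  }

  def main(args: Array[String]): Unit = {
    new Thread(new Runnable { def run(): Unit = failJobBuggy() }).start()
    jobDone.await()
    // Flaky with failJobBuggy, always true with failJobFixed.
    println(s"activeJobs empty when waiter wakes up: ${activeJobs.isEmpty}")
  }
}
```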
spark git commit: [SPARK-6005][TESTS] Fix flaky test: o.a.s.streaming.kafka.DirectKafkaStreamSuite.offset recovery
Repository: spark Updated Branches: refs/heads/branch-1.6 d98fb19c1 -> 4fdac3c27 [SPARK-6005][TESTS] Fix flaky test: o.a.s.streaming.kafka.DirectKafkaStreamSuite.offset recovery ## What changes were proposed in this pull request? Because this test extracts data from `DStream.generatedRDDs` before stopping, it may get data before checkpointing. Then after recovering from the checkpoint, `recoveredOffsetRanges` may contain something not in `offsetRangesBeforeStop`, which will fail the test. Adding `Thread.sleep(1000)` before `ssc.stop()` will reproduce this failure. This PR just moves the logic of `offsetRangesBeforeStop` (also renamed to `offsetRangesAfterStop`) after `ssc.stop()` to fix the flaky test. ## How was this patch tested? Jenkins unit tests. Author: Shixiong Zhu Closes #12903 from zsxwing/SPARK-6005. (cherry picked from commit 9533f5390a3ad7ab96a7bea01cdb6aed89503a51) Signed-off-by: Sean Owen Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4fdac3c2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4fdac3c2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4fdac3c2 Branch: refs/heads/branch-1.6 Commit: 4fdac3c271eccc5db69c45788af15e955752a163 Parents: d98fb19 Author: Shixiong Zhu Authored: Tue May 10 13:26:53 2016 -0700 Committer: Sean Owen Committed: Wed Jun 22 14:10:50 2016 +0100 -- .../kafka/DirectKafkaStreamSuite.scala | 20 ++-- 1 file changed, 14 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4fdac3c2/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala -- diff --git a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala index 02225d5..feea0ae 100644 --- a/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala +++ b/external/kafka/src/test/scala/org/apache/spark/streaming/kafka/DirectKafkaStreamSuite.scala @@ -280,14 +280,20 @@ class DirectKafkaStreamSuite sendDataAndWaitForReceive(i) } +ssc.stop() + // Verify that offset ranges were generated -val offsetRangesBeforeStop = getOffsetRanges(kafkaStream) -assert(offsetRangesBeforeStop.size >= 1, "No offset ranges generated") +// Since "offsetRangesAfterStop" will be used to compare with "recoveredOffsetRanges", we should +// collect offset ranges after stopping. Otherwise, because new RDDs keep being generated before +// stopping, we may not be able to get the latest RDDs, then "recoveredOffsetRanges" will +// contain something not in "offsetRangesAfterStop". 
+val offsetRangesAfterStop = getOffsetRanges(kafkaStream) +assert(offsetRangesAfterStop.size >= 1, "No offset ranges generated") assert( - offsetRangesBeforeStop.head._2.forall { _.fromOffset === 0 }, + offsetRangesAfterStop.head._2.forall { _.fromOffset === 0 }, "starting offset not zero" ) -ssc.stop() + logInfo("== RESTARTING ") // Recover context from checkpoints @@ -297,12 +303,14 @@ class DirectKafkaStreamSuite // Verify offset ranges have been recovered val recoveredOffsetRanges = getOffsetRanges(recoveredStream) assert(recoveredOffsetRanges.size > 0, "No offset ranges recovered") -val earlierOffsetRangesAsSets = offsetRangesBeforeStop.map { x => (x._1, x._2.toSet) } +val earlierOffsetRangesAsSets = offsetRangesAfterStop.map { x => (x._1, x._2.toSet) } assert( recoveredOffsetRanges.forall { or => earlierOffsetRangesAsSets.contains((or._1, or._2.toSet)) }, - "Recovered ranges are not the same as the ones generated" + "Recovered ranges are not the same as the ones generated\n" + +s"recoveredOffsetRanges: $recoveredOffsetRanges\n" + +s"earlierOffsetRangesAsSets: $earlierOffsetRangesAsSets" ) // Restart context, give more data and verify the total at the end // If the total is write that means each records has been received only once
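The essence of the fix is that the snapshot used for comparison is taken only after `ssc.stop()`, and the recovered ranges are then compared per batch time as sets. Below is a small self-contained sketch of that comparison logic; the tuple type is an illustrative stand-in for Kafka's `OffsetRange`, not the real class, and the values are made up.

```scala
object OffsetRangeComparisonSketch {
  // (partition, fromOffset, untilOffset): simplified stand-in for OffsetRange
  type Range = (Int, Long, Long)

  def main(args: Array[String]): Unit = {
    // ranges captured after the streaming context was stopped, keyed by batch time
    val afterStop: Seq[(Long, Seq[Range])] =
      Seq(1000L -> Seq((0, 0L, 5L), (1, 0L, 3L)), 2000L -> Seq((0, 5L, 9L)))
    // ranges recovered from the checkpoint (same data, possibly different ordering)
    val recovered: Seq[(Long, Seq[Range])] =
      Seq(1000L -> Seq((1, 0L, 3L), (0, 0L, 5L)))

    val earlierAsSets = afterStop.map { case (time, rs) => (time, rs.toSet) }
    val allRecoveredSeen = recovered.forall { case (time, rs) =>
      earlierAsSets.contains((time, rs.toSet))
    }
    println(s"recovered ranges are a subset of the after-stop ranges: $allRecoveredSeen")
  }
}
```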
spark git commit: [SPARK-16097][SQL] Encoders.tuple should handle null object correctly
Repository: spark Updated Branches: refs/heads/branch-2.0 60bd704b5 -> 503eb882c [SPARK-16097][SQL] Encoders.tuple should handle null object correctly ## What changes were proposed in this pull request? Although the top-level input object cannot be null, when we use `Encoders.tuple` to combine 2 encoders, their input objects are no longer top level and can be null. We should handle this case. ## How was this patch tested? A new test in DatasetSuite. Author: Wenchen Fan Closes #13807 from cloud-fan/bug. (cherry picked from commit 01277d4b259dcf9cad25eece1377162b7a8c946d) Signed-off-by: Cheng Lian Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/503eb882 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/503eb882 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/503eb882 Branch: refs/heads/branch-2.0 Commit: 503eb882c14eac9681981199ccf8f699cab23bf0 Parents: 60bd704 Author: Wenchen Fan Authored: Wed Jun 22 18:32:14 2016 +0800 Committer: Cheng Lian Committed: Wed Jun 22 18:37:36 2016 +0800 -- .../catalyst/encoders/ExpressionEncoder.scala | 48 ++-- .../org/apache/spark/sql/DatasetSuite.scala | 7 +++ 2 files changed, 42 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/503eb882/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index 0023ce6..1fac26c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateSafeProjection import org.apache.spark.sql.catalyst.expressions.objects.{AssertNotNull, Invoke, NewInstance} import org.apache.spark.sql.catalyst.optimizer.SimplifyCasts import org.apache.spark.sql.catalyst.plans.logical.{CatalystSerde, DeserializeToObject, LocalRelation} -import org.apache.spark.sql.types.{ObjectType, StructField, StructType} +import org.apache.spark.sql.types.{BooleanType, ObjectType, StructField, StructType} import org.apache.spark.util.Utils /** @@ -110,16 +110,34 @@ object ExpressionEncoder { val cls = Utils.getContextOrSparkClassLoader.loadClass(s"scala.Tuple${encoders.size}") -val serializer = encoders.map { - case e if e.flat => e.serializer.head - case other => CreateStruct(other.serializer) -}.zipWithIndex.map { case (expr, index) => - expr.transformUp { -case BoundReference(0, t, _) => - Invoke( -BoundReference(0, ObjectType(cls), nullable = true), -s"_${index + 1}", -t) +val serializer = encoders.zipWithIndex.map { case (enc, index) => + val originalInputObject = enc.serializer.head.collect { case b: BoundReference => b }.head + val newInputObject = Invoke( +BoundReference(0, ObjectType(cls), nullable = true), +s"_${index + 1}", +originalInputObject.dataType) + + val newSerializer = enc.serializer.map(_.transformUp { +case b: BoundReference if b == originalInputObject => newInputObject + }) + + if (enc.flat) { +newSerializer.head + } else { +// For non-flat encoder, the input object is not top level anymore after being combined to +// a tuple encoder, thus it can be null and we should wrap the `CreateStruct` with `If` and +// null check to handle null case correctly. +// e.g.
for Encoder[(Int, String)], the serializer expressions will create 2 columns, and is +// not able to handle the case when the input tuple is null. This is not a problem as there +// is a check to make sure the input object won't be null. However, if this encoder is used +// to create a bigger tuple encoder, the original input object becomes a filed of the new +// input tuple and can be null. So instead of creating a struct directly here, we should add +// a null/None check and return a null struct if the null/None check fails. +val struct = CreateStruct(newSerializer) +val nullCheck = Or( + IsNull(newInputObject), + Invoke(Literal.fromObject(None), "equals", BooleanType, newInputObject :: Nil)) +If(nullCheck, Literal.create(null, struct.dataType), struct) } } @@ -203,8 +221,12 @@ case class ExpressionEncoder[T]( // (intermediate value is not an attribute). We assume that all
spark git commit: [SPARK-16097][SQL] Encoders.tuple should handle null object correctly
Repository: spark Updated Branches: refs/heads/master 39ad53f7f -> 01277d4b2 [SPARK-16097][SQL] Encoders.tuple should handle null object correctly ## What changes were proposed in this pull request? Although the top-level input object cannot be null, when we use `Encoders.tuple` to combine 2 encoders, their input objects are no longer top level and can be null. We should handle this case. ## How was this patch tested? A new test in DatasetSuite. Author: Wenchen Fan Closes #13807 from cloud-fan/bug. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/01277d4b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/01277d4b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/01277d4b Branch: refs/heads/master Commit: 01277d4b259dcf9cad25eece1377162b7a8c946d Parents: 39ad53f Author: Wenchen Fan Authored: Wed Jun 22 18:32:14 2016 +0800 Committer: Cheng Lian Committed: Wed Jun 22 18:32:14 2016 +0800 -- .../catalyst/encoders/ExpressionEncoder.scala | 48 ++-- .../org/apache/spark/sql/DatasetSuite.scala | 7 +++ 2 files changed, 42 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/01277d4b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala -- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala index 0023ce6..1fac26c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/encoders/ExpressionEncoder.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateSafeProjection import org.apache.spark.sql.catalyst.expressions.objects.{AssertNotNull, Invoke, NewInstance} import org.apache.spark.sql.catalyst.optimizer.SimplifyCasts import org.apache.spark.sql.catalyst.plans.logical.{CatalystSerde, DeserializeToObject, LocalRelation} -import org.apache.spark.sql.types.{ObjectType, StructField, StructType} +import org.apache.spark.sql.types.{BooleanType, ObjectType, StructField, StructType} import org.apache.spark.util.Utils /** @@ -110,16 +110,34 @@ object ExpressionEncoder { val cls = Utils.getContextOrSparkClassLoader.loadClass(s"scala.Tuple${encoders.size}") -val serializer = encoders.map { - case e if e.flat => e.serializer.head - case other => CreateStruct(other.serializer) -}.zipWithIndex.map { case (expr, index) => - expr.transformUp { -case BoundReference(0, t, _) => - Invoke( -BoundReference(0, ObjectType(cls), nullable = true), -s"_${index + 1}", -t) +val serializer = encoders.zipWithIndex.map { case (enc, index) => + val originalInputObject = enc.serializer.head.collect { case b: BoundReference => b }.head + val newInputObject = Invoke( +BoundReference(0, ObjectType(cls), nullable = true), +s"_${index + 1}", +originalInputObject.dataType) + + val newSerializer = enc.serializer.map(_.transformUp { +case b: BoundReference if b == originalInputObject => newInputObject + }) + + if (enc.flat) { +newSerializer.head + } else { +// For non-flat encoder, the input object is not top level anymore after being combined to +// a tuple encoder, thus it can be null and we should wrap the `CreateStruct` with `If` and +// null check to handle null case correctly. +// e.g.
for Encoder[(Int, String)], the serializer expressions will create 2 columns, and is +// not able to handle the case when the input tuple is null. This is not a problem as there +// is a check to make sure the input object won't be null. However, if this encoder is used +// to create a bigger tuple encoder, the original input object becomes a filed of the new +// input tuple and can be null. So instead of creating a struct directly here, we should add +// a null/None check and return a null struct if the null/None check fails. +val struct = CreateStruct(newSerializer) +val nullCheck = Or( + IsNull(newInputObject), + Invoke(Literal.fromObject(None), "equals", BooleanType, newInputObject :: Nil)) +If(nullCheck, Literal.create(null, struct.dataType), struct) } } @@ -203,8 +221,12 @@ case class ExpressionEncoder[T]( // (intermediate value is not an attribute). We assume that all serializer expressions use a same // `BoundReference` to refer to the object, and throw exception if
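As a quick way to see the behaviour this change targets, the sketch below builds a nested tuple encoder whose inner (non-flat) component is null. The session setup and values are illustrative only; the point is that after the fix the inner null is serialized as a null struct instead of failing.

```scala
import org.apache.spark.sql.{Encoders, SparkSession}

object TupleEncoderNullDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("tuple-encoder-null-demo")
      .getOrCreate()

    // The inner (String, String) encoder is non-flat; once combined into a bigger tuple
    // encoder its input object is no longer top level and may legitimately be null.
    val enc = Encoders.tuple(Encoders.tuple(Encoders.STRING, Encoders.STRING), Encoders.STRING)
    val data = Seq((("a", "b"), "c"), (null, "d"))

    val ds = spark.createDataset(data)(enc)
    ds.show() // the first column of the second row shows up as null rather than throwing
    spark.stop()
  }
}
```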
spark git commit: [SPARK-16121] ListingFileCatalog does not list in parallel anymore
Repository: spark Updated Branches: refs/heads/branch-2.0 838143a2a -> 60bd704b5 [SPARK-16121] ListingFileCatalog does not list in parallel anymore ## What changes were proposed in this pull request? Seems the fix of SPARK-14959 breaks the parallel partitioning discovery. This PR fixes the problem ## How was this patch tested? Tested manually. (This PR also adds a proper test for SPARK-14959) Author: Yin Huai Closes #13830 from yhuai/SPARK-16121. (cherry picked from commit 39ad53f7ffddae5ba0ff0a76089ba671b14c44c8) Signed-off-by: Cheng Lian Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/60bd704b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/60bd704b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/60bd704b Branch: refs/heads/branch-2.0 Commit: 60bd704b541c4d1991922ffd3dd5b47de9bd5821 Parents: 838143a Author: Yin Huai Authored: Wed Jun 22 18:07:07 2016 +0800 Committer: Cheng Lian Committed: Wed Jun 22 18:07:27 2016 +0800 -- .../datasources/ListingFileCatalog.scala| 58 ++-- .../datasources/fileSourceInterfaces.scala | 7 ++- .../datasources/FileSourceStrategySuite.scala | 45 ++- 3 files changed, 101 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/60bd704b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala index f713fde..675e755 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.datasources import scala.collection.mutable import scala.util.Try -import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.fs.{FileStatus, LocatedFileStatus, Path} import org.apache.hadoop.mapred.{FileInputFormat, JobConf} import org.apache.spark.sql.SparkSession @@ -73,21 +73,67 @@ class ListingFileCatalog( cachedPartitionSpec = null } - protected def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { + /** + * List leaf files of given paths. This method will submit a Spark job to do parallel + * listing whenever there is a path having more files than the parallel partition discovery + * discovery threshold. + */ + protected[spark] def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { HadoopFsRelation.listLeafFilesInParallel(paths, hadoopConf, sparkSession) } else { + // Right now, the number of paths is less than the value of + // parallelPartitionDiscoveryThreshold. So, we will list file statues at the driver. + // If there is any child that has more files than the threshold, we will use parallel + // listing. 
+ // Dummy jobconf to get to the pathFilter defined in configuration val jobConf = new JobConf(hadoopConf, this.getClass) val pathFilter = FileInputFormat.getInputPathFilter(jobConf) + val statuses: Seq[FileStatus] = paths.flatMap { path => val fs = path.getFileSystem(hadoopConf) logTrace(s"Listing $path on driver") -Try { - HadoopFsRelation.listLeafFiles(fs, fs.getFileStatus(path), pathFilter) -}.getOrElse(Array.empty[FileStatus]) + +val childStatuses = { + // TODO: We need to avoid of using Try at here. + val stats = Try(fs.listStatus(path)).getOrElse(Array.empty[FileStatus]) + if (pathFilter != null) stats.filter(f => pathFilter.accept(f.getPath)) else stats +} + +childStatuses.map { + case f: LocatedFileStatus => f + + // NOTE: + // + // - Although S3/S3A/S3N file system can be quite slow for remote file metadata + // operations, calling `getFileBlockLocations` does no harm here since these file system + // implementations don't actually issue RPC for this method. + // + // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not + // be a big deal since we always use to `listLeafFilesInParallel` when the number of + // paths exceeds threshold. + case f => +if (f.isDirectory ) { + // If f is a directory, we do not need to call getFileBlockLocations (SPARK-14959). + f +
spark git commit: [SPARK-16121] ListingFileCatalog does not list in parallel anymore
Repository: spark Updated Branches: refs/heads/master d281b0baf -> 39ad53f7f [SPARK-16121] ListingFileCatalog does not list in parallel anymore ## What changes were proposed in this pull request? Seems the fix of SPARK-14959 breaks the parallel partitioning discovery. This PR fixes the problem ## How was this patch tested? Tested manually. (This PR also adds a proper test for SPARK-14959) Author: Yin Huai Closes #13830 from yhuai/SPARK-16121. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/39ad53f7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/39ad53f7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/39ad53f7 Branch: refs/heads/master Commit: 39ad53f7ffddae5ba0ff0a76089ba671b14c44c8 Parents: d281b0b Author: Yin Huai Authored: Wed Jun 22 18:07:07 2016 +0800 Committer: Cheng Lian Committed: Wed Jun 22 18:07:07 2016 +0800 -- .../datasources/ListingFileCatalog.scala| 58 ++-- .../datasources/fileSourceInterfaces.scala | 7 ++- .../datasources/FileSourceStrategySuite.scala | 45 ++- 3 files changed, 101 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/39ad53f7/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala -- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala index f713fde..675e755 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/ListingFileCatalog.scala @@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.datasources import scala.collection.mutable import scala.util.Try -import org.apache.hadoop.fs.{FileStatus, Path} +import org.apache.hadoop.fs.{FileStatus, LocatedFileStatus, Path} import org.apache.hadoop.mapred.{FileInputFormat, JobConf} import org.apache.spark.sql.SparkSession @@ -73,21 +73,67 @@ class ListingFileCatalog( cachedPartitionSpec = null } - protected def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { + /** + * List leaf files of given paths. This method will submit a Spark job to do parallel + * listing whenever there is a path having more files than the parallel partition discovery + * discovery threshold. + */ + protected[spark] def listLeafFiles(paths: Seq[Path]): mutable.LinkedHashSet[FileStatus] = { if (paths.length >= sparkSession.sessionState.conf.parallelPartitionDiscoveryThreshold) { HadoopFsRelation.listLeafFilesInParallel(paths, hadoopConf, sparkSession) } else { + // Right now, the number of paths is less than the value of + // parallelPartitionDiscoveryThreshold. So, we will list file statues at the driver. + // If there is any child that has more files than the threshold, we will use parallel + // listing. + // Dummy jobconf to get to the pathFilter defined in configuration val jobConf = new JobConf(hadoopConf, this.getClass) val pathFilter = FileInputFormat.getInputPathFilter(jobConf) + val statuses: Seq[FileStatus] = paths.flatMap { path => val fs = path.getFileSystem(hadoopConf) logTrace(s"Listing $path on driver") -Try { - HadoopFsRelation.listLeafFiles(fs, fs.getFileStatus(path), pathFilter) -}.getOrElse(Array.empty[FileStatus]) + +val childStatuses = { + // TODO: We need to avoid of using Try at here. 
+ val stats = Try(fs.listStatus(path)).getOrElse(Array.empty[FileStatus]) + if (pathFilter != null) stats.filter(f => pathFilter.accept(f.getPath)) else stats +} + +childStatuses.map { + case f: LocatedFileStatus => f + + // NOTE: + // + // - Although S3/S3A/S3N file system can be quite slow for remote file metadata + // operations, calling `getFileBlockLocations` does no harm here since these file system + // implementations don't actually issue RPC for this method. + // + // - Here we are calling `getFileBlockLocations` in a sequential manner, but it should not + // be a big deal since we always use to `listLeafFilesInParallel` when the number of + // paths exceeds threshold. + case f => +if (f.isDirectory ) { + // If f is a directory, we do not need to call getFileBlockLocations (SPARK-14959). + f +} else { + HadoopFsRelation.createLocatedFileStatus(f, fs.getFileBlockLocations(f
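For reference, the switch between driver-side and job-based listing is driven by the threshold that `parallelPartitionDiscoveryThreshold` reads, the `spark.sql.sources.parallelPartitionDiscovery.threshold` setting. The snippet below is a minimal sketch of exercising it; the table path is a placeholder and the lowered threshold is only for demonstration.

```scala
import org.apache.spark.sql.SparkSession

object ParallelListingSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[4]")
      .appName("parallel-listing-sketch")
      // Lower the threshold so that even a handful of input paths triggers the
      // parallel (Spark-job based) leaf-file listing instead of driver-side listing.
      .config("spark.sql.sources.parallelPartitionDiscovery.threshold", "1")
      .getOrCreate()

    val df = spark.read.parquet("/path/to/partitioned/table") // placeholder path
    df.printSchema()
    spark.stop()
  }
}
```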
spark git commit: [SPARK-15162][SPARK-15164][PYSPARK][DOCS][ML] update some pydocs
Repository: spark Updated Branches: refs/heads/branch-2.0 e7a489c7f -> 838143a2a [SPARK-15162][SPARK-15164][PYSPARK][DOCS][ML] update some pydocs ## What changes were proposed in this pull request? Mark ml.classification algorithms as experimental to match Scala algorithms, update PyDoc for for thresholds on `LogisticRegression` to have same level of info as Scala, and enable mathjax for PyDoc. ## How was this patch tested? Built docs locally & PySpark SQL tests Author: Holden Karau Closes #12938 from holdenk/SPARK-15162-SPARK-15164-update-some-pydocs. (cherry picked from commit d281b0bafe6aa23085d4d2b68f0ce321f1978b50) Signed-off-by: Nick Pentreath Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/838143a2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/838143a2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/838143a2 Branch: refs/heads/branch-2.0 Commit: 838143a2a02192a9ebc955b8060a6520b62d9644 Parents: e7a489c Author: Holden Karau Authored: Wed Jun 22 11:54:49 2016 +0200 Committer: Nick Pentreath Committed: Wed Jun 22 11:55:10 2016 +0200 -- .../ml/classification/LogisticRegression.scala | 5 ++- python/docs/conf.py | 1 + python/pyspark/ml/classification.py | 38 ++-- 3 files changed, 39 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/838143a2/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index a7ba39e..2fa8fbc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -72,10 +72,9 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas /** * Get threshold for binary classification. * - * If [[threshold]] is set, returns that value. - * Otherwise, if [[thresholds]] is set with length 2 (i.e., binary classification), + * If [[thresholds]] is set with length 2 (i.e., binary classification), * this returns the equivalent threshold: {{{1 / (1 + thresholds(0) / thresholds(1))}}}. - * Otherwise, returns [[threshold]] default value. + * Otherwise, returns [[threshold]] if set, or its default value if unset. * * @group getParam * @throws IllegalArgumentException if [[thresholds]] is set to an array of length other than 2. http://git-wip-us.apache.org/repos/asf/spark/blob/838143a2/python/docs/conf.py -- diff --git a/python/docs/conf.py b/python/docs/conf.py index d35bf73..50fb317 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -32,6 +32,7 @@ extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'epytext', +'sphinx.ext.mathjax', ] # Add any paths that contain templates here, relative to this directory. http://git-wip-us.apache.org/repos/asf/spark/blob/838143a2/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index e86c27e..d6d713c 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -49,6 +49,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds, HasWeightCol, JavaMLWritable, JavaMLReadable): """ +.. note:: Experimental + Logistic regression. 
Currently, this class only supports binary classification. @@ -96,7 +98,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti threshold = Param(Params._dummy(), "threshold", "Threshold in binary classification prediction, in range [0, 1]." + - " If threshold and thresholds are both set, they must match.", + " If threshold and thresholds are both set, they must match." + + "e.g. if threshold is p, then thresholds must be equal to [1-p, p].", typeConverter=TypeConverters.toFloat) @keyword_only @@ -154,7 +157,12 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti @since("1.4.0") def getThreshold(self): """ -Gets the value of threshold or its default value. +Get threshold for binar
spark git commit: [SPARK-15162][SPARK-15164][PYSPARK][DOCS][ML] update some pydocs
Repository: spark Updated Branches: refs/heads/master 0e3ce7533 -> d281b0baf [SPARK-15162][SPARK-15164][PYSPARK][DOCS][ML] update some pydocs ## What changes were proposed in this pull request? Mark ml.classification algorithms as experimental to match Scala algorithms, update PyDoc for for thresholds on `LogisticRegression` to have same level of info as Scala, and enable mathjax for PyDoc. ## How was this patch tested? Built docs locally & PySpark SQL tests Author: Holden Karau Closes #12938 from holdenk/SPARK-15162-SPARK-15164-update-some-pydocs. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d281b0ba Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d281b0ba Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d281b0ba Branch: refs/heads/master Commit: d281b0bafe6aa23085d4d2b68f0ce321f1978b50 Parents: 0e3ce75 Author: Holden Karau Authored: Wed Jun 22 11:54:49 2016 +0200 Committer: Nick Pentreath Committed: Wed Jun 22 11:54:49 2016 +0200 -- .../ml/classification/LogisticRegression.scala | 5 ++- python/docs/conf.py | 1 + python/pyspark/ml/classification.py | 38 ++-- 3 files changed, 39 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d281b0ba/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index a7ba39e..2fa8fbc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -72,10 +72,9 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas /** * Get threshold for binary classification. * - * If [[threshold]] is set, returns that value. - * Otherwise, if [[thresholds]] is set with length 2 (i.e., binary classification), + * If [[thresholds]] is set with length 2 (i.e., binary classification), * this returns the equivalent threshold: {{{1 / (1 + thresholds(0) / thresholds(1))}}}. - * Otherwise, returns [[threshold]] default value. + * Otherwise, returns [[threshold]] if set, or its default value if unset. * * @group getParam * @throws IllegalArgumentException if [[thresholds]] is set to an array of length other than 2. http://git-wip-us.apache.org/repos/asf/spark/blob/d281b0ba/python/docs/conf.py -- diff --git a/python/docs/conf.py b/python/docs/conf.py index d35bf73..50fb317 100644 --- a/python/docs/conf.py +++ b/python/docs/conf.py @@ -32,6 +32,7 @@ extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.viewcode', 'epytext', +'sphinx.ext.mathjax', ] # Add any paths that contain templates here, relative to this directory. http://git-wip-us.apache.org/repos/asf/spark/blob/d281b0ba/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index e86c27e..d6d713c 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -49,6 +49,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds, HasWeightCol, JavaMLWritable, JavaMLReadable): """ +.. note:: Experimental + Logistic regression. Currently, this class only supports binary classification. 
@@ -96,7 +98,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti threshold = Param(Params._dummy(), "threshold", "Threshold in binary classification prediction, in range [0, 1]." + - " If threshold and thresholds are both set, they must match.", + " If threshold and thresholds are both set, they must match." + + "e.g. if threshold is p, then thresholds must be equal to [1-p, p].", typeConverter=TypeConverters.toFloat) @keyword_only @@ -154,7 +157,12 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti @since("1.4.0") def getThreshold(self): """ -Gets the value of threshold or its default value. +Get threshold for binary classification. + +If :py:attr:`thresholds` is set with length 2 (i.e., binary classification), +
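The threshold/thresholds relationship documented above is easy to sanity-check numerically. The small standalone snippet below simply evaluates the documented formula threshold = 1 / (1 + thresholds(0) / thresholds(1)) and its inverse thresholds = [1 - p, p]; it does not call the Spark API, and the sample values are arbitrary.

```scala
object ThresholdEquivalenceCheck {
  // threshold implied by a length-2 thresholds array, per the LogisticRegression docs
  def thresholdFrom(thresholds: Array[Double]): Double =
    1.0 / (1.0 + thresholds(0) / thresholds(1))

  // thresholds array implied by a binary threshold p
  def thresholdsFrom(p: Double): Array[Double] = Array(1.0 - p, p)

  def main(args: Array[String]): Unit = {
    println(thresholdFrom(Array(0.4, 0.6)))      // 0.6
    println(thresholdFrom(Array(0.5, 0.5)))      // 0.5
    println(thresholdsFrom(0.6).mkString(", "))  // 0.4, 0.6
  }
}
```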