spark git commit: [SPARK-10655][SQL] Adding additional data type mappings to jdbc DB2dialect.

2017-06-20 Thread lixiao
Repository: spark
Updated Branches:
  refs/heads/master b6b108826 -> 9ce714dca


[SPARK-10655][SQL] Adding additional data type mappings to jdbc DB2dialect.

This patch adds DB2-specific data type mappings to the JDBC data source's DB2 dialect:
decfloat, real, xml, and timestamp with time zone (a DB2Z-specific type) on read, and
byte and short on write. The default mappings do not work for these types when reading
from or writing to a DB2 database.

Added a Docker integration test and a JDBC unit test case.
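
For reference, here is a minimal sketch (not the committed code) of how such dialect-specific
read/write mappings are expressed against Spark's JdbcDialect API; the concrete type choices
below are illustrative assumptions, not necessarily the exact mappings in this patch:

```scala
import java.sql.Types
import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcType}
import org.apache.spark.sql.types._

object ExampleDB2Dialect extends JdbcDialect {
  override def canHandle(url: String): Boolean = url.startsWith("jdbc:db2")

  // Read side: map DB2-specific JDBC types to Catalyst types.
  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] =
    sqlType match {
      case Types.REAL => Option(FloatType)
      case Types.OTHER if typeName.equalsIgnoreCase("DECFLOAT") => Option(DecimalType(38, 18))
      case Types.OTHER if typeName.equalsIgnoreCase("XML") => Option(StringType)
      case _ => None
    }

  // Write side: map byte and short to SMALLINT, since the generic defaults are not accepted by DB2.
  override def getJDBCType(dt: DataType): Option[JdbcType] = dt match {
    case ByteType | ShortType => Option(JdbcType("SMALLINT", Types.SMALLINT))
    case _ => None
  }
}
```

A custom dialect of this shape would be registered with `JdbcDialects.registerDialect`; the
built-in DB2 dialect touched by this patch is picked up automatically for `jdbc:db2` URLs.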

Author: sureshthalamati 

Closes #9162 from sureshthalamati/db2dialect_enhancements-spark-10655.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9ce714dc
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9ce714dc
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9ce714dc

Branch: refs/heads/master
Commit: 9ce714dca272315ef7f50d791563f22e8d5922ac
Parents: b6b1088
Author: sureshthalamati 
Authored: Tue Jun 20 22:35:42 2017 -0700
Committer: gatorsmile 
Committed: Tue Jun 20 22:35:42 2017 -0700

--
 .../spark/sql/jdbc/DB2IntegrationSuite.scala| 47 +++-
 .../org/apache/spark/sql/jdbc/DB2Dialect.scala  | 21 -
 .../org/apache/spark/sql/jdbc/JDBCSuite.scala   |  9 
 3 files changed, 66 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9ce714dc/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala
--
diff --git 
a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala
 
b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala
index 3da34b1..f5930bc28 100644
--- 
a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala
+++ 
b/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala
@@ -21,10 +21,13 @@ import java.math.BigDecimal
 import java.sql.{Connection, Date, Timestamp}
 import java.util.Properties
 
-import org.scalatest._
+import org.scalatest.Ignore
 
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.types.{BooleanType, ByteType, ShortType, StructType}
 import org.apache.spark.tags.DockerTest
 
+
 @DockerTest
 @Ignore // AMPLab Jenkins needs to be updated before shared memory works on docker
 class DB2IntegrationSuite extends DockerJDBCIntegrationSuite {
@@ -47,19 +50,22 @@ class DB2IntegrationSuite extends DockerJDBCIntegrationSuite {
     conn.prepareStatement("INSERT INTO tbl VALUES (17,'dave')").executeUpdate()
 
     conn.prepareStatement("CREATE TABLE numbers ( small SMALLINT, med INTEGER, big BIGINT, "
-      + "deci DECIMAL(31,20), flt FLOAT, dbl DOUBLE)").executeUpdate()
+      + "deci DECIMAL(31,20), flt FLOAT, dbl DOUBLE, real REAL, "
+      + "decflt DECFLOAT, decflt16 DECFLOAT(16), decflt34 DECFLOAT(34))").executeUpdate()
     conn.prepareStatement("INSERT INTO numbers VALUES (17, 7, 922337203685477580, "
-      + "123456745.567890123450, 42.75, 5.4E-70)").executeUpdate()
+      + "123456745.567890123450, 42.75, 5.4E-70, "
+      + "3.4028234663852886e+38, 4.2999, DECFLOAT('9.999E19', 16), "
+      + "DECFLOAT('1234567891234567.123456789123456789', 34))").executeUpdate()
 
     conn.prepareStatement("CREATE TABLE dates (d DATE, t TIME, ts TIMESTAMP )").executeUpdate()
     conn.prepareStatement("INSERT INTO dates VALUES ('1991-11-09', '13:31:24', "
       + "'2009-02-13 23:31:30')").executeUpdate()
 
     // TODO: Test locale conversion for strings.
-    conn.prepareStatement("CREATE TABLE strings (a CHAR(10), b VARCHAR(10), c CLOB, d BLOB)")
-      .executeUpdate()
-    conn.prepareStatement("INSERT INTO strings VALUES ('the', 'quick', 'brown', BLOB('fox'))")
+    conn.prepareStatement("CREATE TABLE strings (a CHAR(10), b VARCHAR(10), c CLOB, d BLOB, e XML)")
       .executeUpdate()
+    conn.prepareStatement("INSERT INTO strings VALUES ('the', 'quick', 'brown', BLOB('fox'),"
+      + "'Kathy')").executeUpdate()
   }
 
   test("Basic test") {
@@ -77,13 +83,17 @@ class DB2IntegrationSuite extends DockerJDBCIntegrationSuite {
     val rows = df.collect()
     assert(rows.length == 1)
     val types = rows(0).toSeq.map(x => x.getClass.toString)
-    assert(types.length == 6)
+    assert(types.length == 10)
     assert(types(0).equals("class java.lang.Integer"))
     assert(types(1).equals("class java.lang.Integer"))
     assert(types(2).equals("class java.lang.Long"))
     assert(types(3).equals("class java.math.BigDecimal"))
     assert(types(4).equals("class java.lang.Double"))

spark-website git commit: Add a note for searching PySpark and SparkR version changes in release-process.md

2017-06-20 Thread srowen
Repository: spark-website
Updated Branches:
  refs/heads/asf-site 2c375dc4f -> af0b9fba2


Add a note for searching PySpark and SparkR version changes in 
release-process.md


Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/af0b9fba
Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/af0b9fba
Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/af0b9fba

Branch: refs/heads/asf-site
Commit: af0b9fba2636959e563e5d1bbca2e2212f489d10
Parents: 2c375dc
Author: hyukjinkwon 
Authored: Wed Jun 21 01:38:13 2017 +0900
Committer: Sean Owen 
Committed: Tue Jun 20 21:24:26 2017 +0100

--
 release-process.md| 2 ++
 site/release-process.html | 2 ++
 2 files changed, 4 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark-website/blob/af0b9fba/release-process.md
--
diff --git a/release-process.md b/release-process.md
index a98dc80..a5a609e 100644
--- a/release-process.md
+++ b/release-process.md
@@ -46,6 +46,8 @@ Maven when cutting the release. Note that there are a few exceptions that should
 - **Spark REPLs**. Look for the Spark ASCII art in `SparkILoopInit.scala` for the Scala shell 
 and in `shell.py` for the Python REPL.
 - **Docs**. Search for VERSION in `docs/_config.yml`
+- **PySpark**. Search for `__version__` in `python/pyspark/version.py`
+- **SparkR**. Search for `Version` in `R/pkg/DESCRIPTION`
 
 Finally, update `CHANGES.txt` with this script in the Spark repository. `CHANGES.txt` captures 
 all the patches that have made it into this release candidate since the last release.

http://git-wip-us.apache.org/repos/asf/spark-website/blob/af0b9fba/site/release-process.html
--
diff --git a/site/release-process.html b/site/release-process.html
index cae0646..2a4bcdc 100644
--- a/site/release-process.html
+++ b/site/release-process.html
@@ -245,6 +245,8 @@ Maven when cutting the release. Note that there are a few exceptions that should
   Spark REPLs. Look for the Spark ASCII art in SparkILoopInit.scala for the Scala shell 
 and in shell.py for the Python REPL.
   Docs. Search for VERSION in docs/_config.yml
+  PySpark. Search for __version__ in python/pyspark/version.py
+  SparkR. Search for Version in R/pkg/DESCRIPTION
 
 
 Finally, update CHANGES.txt with this script in the Spark repository. CHANGES.txt captures 





spark-website git commit: Add .DS_Store in .gitignore

2017-06-20 Thread srowen
Repository: spark-website
Updated Branches:
  refs/heads/asf-site fda1364c8 -> 2c375dc4f


Add .DS_Store in .gitignore


Project: http://git-wip-us.apache.org/repos/asf/spark-website/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark-website/commit/2c375dc4
Tree: http://git-wip-us.apache.org/repos/asf/spark-website/tree/2c375dc4
Diff: http://git-wip-us.apache.org/repos/asf/spark-website/diff/2c375dc4

Branch: refs/heads/asf-site
Commit: 2c375dc4fde2bc619df25f36fb27908cdc4a7041
Parents: fda1364
Author: hyukjinkwon 
Authored: Wed Jun 21 01:41:49 2017 +0900
Committer: hyukjinkwon 
Committed: Wed Jun 21 01:41:49 2017 +0900

--
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark-website/blob/2c375dc4/.gitignore
--
diff --git a/.gitignore b/.gitignore
index 62c8935..f32e31a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-.idea/
\ No newline at end of file
+.idea/
+.DS_Store





spark git commit: [SPARK-21123][DOCS][STRUCTURED STREAMING] Options for file stream source are in a wrong table - version to fix 2.1

2017-06-20 Thread zsxwing
Repository: spark
Updated Branches:
  refs/heads/branch-2.1 7799f35dd -> 8923bac1e


[SPARK-21123][DOCS][STRUCTURED STREAMING] Options for file stream source are in 
a wrong table - version to fix 2.1

## What changes were proposed in this pull request?

The description for several options of File Source for structured streaming 
appeared in the File Sink description instead.

This commit continues PR #18342 and targets the documentation fixes for Spark version 2.1.

## How was this patch tested?

Built the documentation with `SKIP_API=1 jekyll build` and visually inspected the Structured
Streaming programming guide.

zsxwing This is the PR to fix version 2.1 as discussed in PR #18342

Author: assafmendelson 

Closes #18363 from assafmendelson/spark-21123-for-spark2.1.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8923bac1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8923bac1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8923bac1

Branch: refs/heads/branch-2.1
Commit: 8923bac1e895e57ce2d9ef6aea31e13e390be6da
Parents: 7799f35
Author: assafmendelson 
Authored: Tue Jun 20 13:07:51 2017 -0700
Committer: Shixiong Zhu 
Committed: Tue Jun 20 13:07:51 2017 -0700

--
 docs/structured-streaming-programming-guide.md | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8923bac1/docs/structured-streaming-programming-guide.md
--
diff --git a/docs/structured-streaming-programming-guide.md 
b/docs/structured-streaming-programming-guide.md
index da5c234..e536f5d 100644
--- a/docs/structured-streaming-programming-guide.md
+++ b/docs/structured-streaming-programming-guide.md
@@ -449,6 +449,10 @@ Here are the details of all the sources in Spark.
 File source
 
 path: path to the input directory, and common to all file formats.
+
+maxFilesPerTrigger: maximum number of new files to be considered in every trigger (default: no max)
+
+latestFirst: whether to process the latest new files first, useful when there is a large backlog of files (default: false)
 
 For file-format-specific options, see the related methods in DataStreamReader
 (Scala/Java/Python).
@@ -1076,9 +1080,6 @@ Here are the details of all the sinks in Spark.
 Append
 
 path: path to the output directory, must be specified.
-maxFilesPerTrigger: maximum number of new files to be considered in every trigger (default: no max)
-
-latestFirst: whether to process the latest new files first, useful when there is a large backlog of files (default: false)
 
 For file-format-specific options, see the related methods in DataFrameWriter
 (Scala/Java/Python).
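
For orientation, a small sketch (not part of this patch) of how these file source options are
supplied when building a streaming query; the schema and input path below are placeholder
assumptions:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val spark = SparkSession.builder().appName("file-source-options").getOrCreate()
val schema = StructType(Seq(StructField("value", StringType)))

// maxFilesPerTrigger and latestFirst are file-source options, not file-sink options,
// which is exactly what this documentation fix clarifies.
val stream = spark.readStream
  .schema(schema)
  .option("maxFilesPerTrigger", "100")  // cap on new files considered per trigger
  .option("latestFirst", true)          // process newest files first when backlogged
  .csv("/tmp/streaming-input")          // hypothetical input directory
```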





spark git commit: [SPARK-21103][SQL] QueryPlanConstraints should be part of LogicalPlan

2017-06-20 Thread rxin
Repository: spark
Updated Branches:
  refs/heads/master e862dc904 -> b6b108826


[SPARK-21103][SQL] QueryPlanConstraints should be part of LogicalPlan

## What changes were proposed in this pull request?
QueryPlanConstraints should be part of LogicalPlan, rather than QueryPlan, 
since the constraint framework is only used for query plan rewriting and not 
for physical planning.

## How was this patch tested?
Should be covered by existing tests, since it is a simple refactoring.

Author: Reynold Xin 

Closes #18310 from rxin/SPARK-21103.
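
As a brief, hedged illustration (not from the patch) of what this placement means in practice:
the constraint set is now reachable only from logical plans, so a rewrite rule could inspect it
as in the hypothetical helper below.

```scala
import org.apache.spark.sql.catalyst.expressions.{Expression, IsNotNull}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

// Hypothetical helper: collect the not-null invariants a logical plan guarantees for its
// output rows, using the constraints framework this patch moves under logical plans.
def notNullConstraints(plan: LogicalPlan): Set[Expression] =
  plan.constraints.filter(_.isInstanceOf[IsNotNull]).toSet
```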


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b6b10882
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b6b10882
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b6b10882

Branch: refs/heads/master
Commit: b6b108826a5dd5c889a70180365f9320452557fc
Parents: e862dc9
Author: Reynold Xin 
Authored: Tue Jun 20 11:34:22 2017 -0700
Committer: Reynold Xin 
Committed: Tue Jun 20 11:34:22 2017 -0700

--
 .../spark/sql/catalyst/plans/QueryPlan.scala|   5 +-
 .../catalyst/plans/QueryPlanConstraints.scala   | 195 --
 .../catalyst/plans/logical/LogicalPlan.scala|   2 +-
 .../plans/logical/QueryPlanConstraints.scala| 196 +++
 4 files changed, 198 insertions(+), 200 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b6b10882/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
index 9130b14..1f6d05b 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
@@ -22,10 +22,7 @@ import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{DataType, StructType}
 
-abstract class QueryPlan[PlanType <: QueryPlan[PlanType]]
-  extends TreeNode[PlanType]
-  with QueryPlanConstraints[PlanType] {
-
+abstract class QueryPlan[PlanType <: QueryPlan[PlanType]] extends TreeNode[PlanType] {
   self: PlanType =>
 
   def conf: SQLConf = SQLConf.get

http://git-wip-us.apache.org/repos/asf/spark/blob/b6b10882/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlanConstraints.scala
--
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlanConstraints.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlanConstraints.scala
deleted file mode 100644
index b08a009..000
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlanConstraints.scala
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.sql.catalyst.plans
-
-import org.apache.spark.sql.catalyst.expressions._
-
-
-trait QueryPlanConstraints[PlanType <: QueryPlan[PlanType]] { self: QueryPlan[PlanType] =>
-
-  /**
-   * An [[ExpressionSet]] that contains invariants about the rows output by this operator. For
-   * example, if this set contains the expression `a = 2` then that expression is guaranteed to
-   * evaluate to `true` for all rows produced.
-   */
-  lazy val constraints: ExpressionSet = {
-    if (conf.constraintPropagationEnabled) {
-      ExpressionSet(
-        validConstraints
-          .union(inferAdditionalConstraints(validConstraints))
-          .union(constructIsNotNullConstraints(validConstraints))
-          .filter { c =>
-            c.references.nonEmpty && c.references.subsetOf(outputSet) && c.deterministic
-          }
-      )
-    } else {
-      ExpressionSet(Set.empty)
-    }
-  }
-
-  /**
-   * This method can be overridden by any child 

[1/2] spark git commit: Preparing Spark release v2.2.0-rc5

2017-06-20 Thread pwendell
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 b8b80f6de -> e88349873


Preparing Spark release v2.2.0-rc5


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/62e442e7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/62e442e7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/62e442e7

Branch: refs/heads/branch-2.2
Commit: 62e442e73a2fa663892d2edaff5f7d72d7f402ed
Parents: b8b80f6
Author: Patrick Wendell 
Authored: Tue Jun 20 10:56:51 2017 -0700
Committer: Patrick Wendell 
Committed: Tue Jun 20 10:56:51 2017 -0700

--
 R/pkg/DESCRIPTION | 2 +-
 assembly/pom.xml  | 2 +-
 common/network-common/pom.xml | 2 +-
 common/network-shuffle/pom.xml| 2 +-
 common/network-yarn/pom.xml   | 2 +-
 common/sketch/pom.xml | 2 +-
 common/tags/pom.xml   | 2 +-
 common/unsafe/pom.xml | 2 +-
 core/pom.xml  | 2 +-
 docs/_config.yml  | 4 ++--
 examples/pom.xml  | 2 +-
 external/docker-integration-tests/pom.xml | 2 +-
 external/flume-assembly/pom.xml   | 2 +-
 external/flume-sink/pom.xml   | 2 +-
 external/flume/pom.xml| 2 +-
 external/kafka-0-10-assembly/pom.xml  | 2 +-
 external/kafka-0-10-sql/pom.xml   | 2 +-
 external/kafka-0-10/pom.xml   | 2 +-
 external/kafka-0-8-assembly/pom.xml   | 2 +-
 external/kafka-0-8/pom.xml| 2 +-
 external/kinesis-asl-assembly/pom.xml | 2 +-
 external/kinesis-asl/pom.xml  | 2 +-
 external/spark-ganglia-lgpl/pom.xml   | 2 +-
 graphx/pom.xml| 2 +-
 launcher/pom.xml  | 2 +-
 mllib-local/pom.xml   | 2 +-
 mllib/pom.xml | 2 +-
 pom.xml   | 2 +-
 python/pyspark/version.py | 2 +-
 repl/pom.xml  | 2 +-
 resource-managers/mesos/pom.xml   | 2 +-
 resource-managers/yarn/pom.xml| 2 +-
 sql/catalyst/pom.xml  | 2 +-
 sql/core/pom.xml  | 2 +-
 sql/hive-thriftserver/pom.xml | 2 +-
 sql/hive/pom.xml  | 2 +-
 streaming/pom.xml | 2 +-
 tools/pom.xml | 2 +-
 38 files changed, 39 insertions(+), 39 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/62e442e7/R/pkg/DESCRIPTION
--
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index cfa49b9..879c1f8 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: SparkR
 Type: Package
-Version: 2.2.1
+Version: 2.2.0
 Title: R Frontend for Apache Spark
 Description: The SparkR package provides an R Frontend for Apache Spark.
 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),

http://git-wip-us.apache.org/repos/asf/spark/blob/62e442e7/assembly/pom.xml
--
diff --git a/assembly/pom.xml b/assembly/pom.xml
index da7b0c9..3a7003f 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.2.1-SNAPSHOT
+2.2.0
 ../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/62e442e7/common/network-common/pom.xml
--
diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml
index 7577253..5e9ffd1 100644
--- a/common/network-common/pom.xml
+++ b/common/network-common/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.2.1-SNAPSHOT
+2.2.0
 ../../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/62e442e7/common/network-shuffle/pom.xml
--
diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index 558864a..c3e10d1 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.2.1-SNAPSHOT
+2.2.0
 ../../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/62e442e7/common/network-yarn/pom.xml
--
diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml
index de66617..e66a8b4 100644
--- a/common/network-yarn/pom.xml
+++ b/common/network-yarn/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 

[2/2] spark git commit: Preparing development version 2.2.1-SNAPSHOT

2017-06-20 Thread pwendell
Preparing development version 2.2.1-SNAPSHOT


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e8834987
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e8834987
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e8834987

Branch: refs/heads/branch-2.2
Commit: e88349873b3678045741c82009f36d7fe66d29ee
Parents: 62e442e
Author: Patrick Wendell 
Authored: Tue Jun 20 10:56:55 2017 -0700
Committer: Patrick Wendell 
Committed: Tue Jun 20 10:56:55 2017 -0700

--
 R/pkg/DESCRIPTION | 2 +-
 assembly/pom.xml  | 2 +-
 common/network-common/pom.xml | 2 +-
 common/network-shuffle/pom.xml| 2 +-
 common/network-yarn/pom.xml   | 2 +-
 common/sketch/pom.xml | 2 +-
 common/tags/pom.xml   | 2 +-
 common/unsafe/pom.xml | 2 +-
 core/pom.xml  | 2 +-
 docs/_config.yml  | 4 ++--
 examples/pom.xml  | 2 +-
 external/docker-integration-tests/pom.xml | 2 +-
 external/flume-assembly/pom.xml   | 2 +-
 external/flume-sink/pom.xml   | 2 +-
 external/flume/pom.xml| 2 +-
 external/kafka-0-10-assembly/pom.xml  | 2 +-
 external/kafka-0-10-sql/pom.xml   | 2 +-
 external/kafka-0-10/pom.xml   | 2 +-
 external/kafka-0-8-assembly/pom.xml   | 2 +-
 external/kafka-0-8/pom.xml| 2 +-
 external/kinesis-asl-assembly/pom.xml | 2 +-
 external/kinesis-asl/pom.xml  | 2 +-
 external/spark-ganglia-lgpl/pom.xml   | 2 +-
 graphx/pom.xml| 2 +-
 launcher/pom.xml  | 2 +-
 mllib-local/pom.xml   | 2 +-
 mllib/pom.xml | 2 +-
 pom.xml   | 2 +-
 python/pyspark/version.py | 2 +-
 repl/pom.xml  | 2 +-
 resource-managers/mesos/pom.xml   | 2 +-
 resource-managers/yarn/pom.xml| 2 +-
 sql/catalyst/pom.xml  | 2 +-
 sql/core/pom.xml  | 2 +-
 sql/hive-thriftserver/pom.xml | 2 +-
 sql/hive/pom.xml  | 2 +-
 streaming/pom.xml | 2 +-
 tools/pom.xml | 2 +-
 38 files changed, 39 insertions(+), 39 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e8834987/R/pkg/DESCRIPTION
--
diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION
index 879c1f8..cfa49b9 100644
--- a/R/pkg/DESCRIPTION
+++ b/R/pkg/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: SparkR
 Type: Package
-Version: 2.2.0
+Version: 2.2.1
 Title: R Frontend for Apache Spark
 Description: The SparkR package provides an R Frontend for Apache Spark.
 Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"),

http://git-wip-us.apache.org/repos/asf/spark/blob/e8834987/assembly/pom.xml
--
diff --git a/assembly/pom.xml b/assembly/pom.xml
index 3a7003f..da7b0c9 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -21,7 +21,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.2.0
+2.2.1-SNAPSHOT
 ../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/e8834987/common/network-common/pom.xml
--
diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml
index 5e9ffd1..7577253 100644
--- a/common/network-common/pom.xml
+++ b/common/network-common/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.2.0
+2.2.1-SNAPSHOT
 ../../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/e8834987/common/network-shuffle/pom.xml
--
diff --git a/common/network-shuffle/pom.xml b/common/network-shuffle/pom.xml
index c3e10d1..558864a 100644
--- a/common/network-shuffle/pom.xml
+++ b/common/network-shuffle/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.2.0
+2.2.1-SNAPSHOT
 ../../pom.xml
   
 

http://git-wip-us.apache.org/repos/asf/spark/blob/e8834987/common/network-yarn/pom.xml
--
diff --git a/common/network-yarn/pom.xml b/common/network-yarn/pom.xml
index e66a8b4..de66617 100644
--- a/common/network-yarn/pom.xml
+++ b/common/network-yarn/pom.xml
@@ -22,7 +22,7 @@
   
 org.apache.spark
 spark-parent_2.11
-2.2.0
+2.2.1-SNAPSHOT
 ../../pom.xml
   
 


[spark] Git Push Summary

2017-06-20 Thread pwendell
Repository: spark
Updated Tags:  refs/tags/v2.2.0-rc5 [created] 62e442e73




spark git commit: [SPARK-21150][SQL] Persistent view stored in Hive metastore should be case preserving

2017-06-20 Thread lixiao
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 514a7e6f8 -> b8b80f6de


[SPARK-21150][SQL] Persistent view stored in Hive metastore should be case 
preserving

## What changes were proposed in this pull request?

This is a regression in Spark 2.2. Spark 2.2 introduced a new way to resolve persisted views
(https://issues.apache.org/jira/browse/SPARK-18209), but it makes persisted views
non-case-preserving because we store the schema in the Hive metastore directly. We should
follow data source tables and store the schema in table properties.

## How was this patch tested?

new regression test

Author: Wenchen Fan 

Closes #18360 from cloud-fan/view.
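
For illustration, the guarantee being restored, lifted directly from the regression test added
by this patch (assuming a `spark` session available in a test context):

```scala
// View column names must keep their original case after CREATE and CREATE OR REPLACE.
sql("CREATE VIEW v AS SELECT 1 as aBc")
assert(spark.table("v").schema.head.name == "aBc")

sql("CREATE OR REPLACE VIEW v AS SELECT 2 as cBa")
assert(spark.table("v").schema.head.name == "cBa")
```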

(cherry picked from commit e862dc904963cf7832bafc1d3d0ea9090bbddd81)
Signed-off-by: gatorsmile 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b8b80f6d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b8b80f6d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b8b80f6d

Branch: refs/heads/branch-2.2
Commit: b8b80f6dea86d4e4a648b86e38936d3a82ffc0aa
Parents: 514a7e6
Author: Wenchen Fan 
Authored: Tue Jun 20 09:15:33 2017 -0700
Committer: gatorsmile 
Committed: Tue Jun 20 09:15:41 2017 -0700

--
 .../spark/sql/execution/command/views.scala |  4 +-
 .../spark/sql/execution/SQLViewSuite.scala  | 10 +++
 .../spark/sql/hive/HiveExternalCatalog.scala| 84 ++--
 3 files changed, 56 insertions(+), 42 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b8b80f6d/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala
index 00f0aca..3518ee5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala
@@ -159,7 +159,9 @@ case class CreateViewCommand(
       checkCyclicViewReference(analyzedPlan, Seq(viewIdent), viewIdent)
 
       // Handles `CREATE OR REPLACE VIEW v0 AS SELECT ...`
-      catalog.alterTable(prepareTable(sparkSession, analyzedPlan))
+      // Nothing we need to retain from the old view, so just drop and create a new one
+      catalog.dropTable(viewIdent, ignoreIfNotExists = false, purge = false)
+      catalog.createTable(prepareTable(sparkSession, analyzedPlan), ignoreIfExists = false)
     } else {
       // Handles `CREATE VIEW v0 AS SELECT ...`. Throws exception when the target view already
       // exists.

http://git-wip-us.apache.org/repos/asf/spark/blob/b8b80f6d/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala
--
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala
index d32716c..6761f05 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala
@@ -669,4 +669,14 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils {
 "positive."))
 }
   }
+
+  test("permanent view should be case-preserving") {
+withView("v") {
+  sql("CREATE VIEW v AS SELECT 1 as aBc")
+  assert(spark.table("v").schema.head.name == "aBc")
+
+  sql("CREATE OR REPLACE VIEW v AS SELECT 2 as cBa")
+  assert(spark.table("v").schema.head.name == "cBa")
+}
+  }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/b8b80f6d/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
--
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index a03beb7..f2fe227 100644
--- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -224,39 +224,36 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       throw new TableAlreadyExistsException(db = db, table = table)
     }
 
-    if (tableDefinition.tableType == VIEW) {
-      client.createTable(tableDefinition, ignoreIfExists)
+    // Ideally we should not create a managed table with location, but Hive serde table can
+    // specify location for managed table. And in 

spark git commit: [SPARK-21150][SQL] Persistent view stored in Hive metastore should be case preserving

2017-06-20 Thread lixiao
Repository: spark
Updated Branches:
  refs/heads/master ef1622899 -> e862dc904


[SPARK-21150][SQL] Persistent view stored in Hive metastore should be case 
preserving

## What changes were proposed in this pull request?

This is a regression in Spark 2.2. Spark 2.2 introduced a new way to resolve persisted views
(https://issues.apache.org/jira/browse/SPARK-18209), but it makes persisted views
non-case-preserving because we store the schema in the Hive metastore directly. We should
follow data source tables and store the schema in table properties.

## How was this patch tested?

new regression test

Author: Wenchen Fan 

Closes #18360 from cloud-fan/view.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e862dc90
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e862dc90
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e862dc90

Branch: refs/heads/master
Commit: e862dc904963cf7832bafc1d3d0ea9090bbddd81
Parents: ef16228
Author: Wenchen Fan 
Authored: Tue Jun 20 09:15:33 2017 -0700
Committer: gatorsmile 
Committed: Tue Jun 20 09:15:33 2017 -0700

--
 .../spark/sql/execution/command/views.scala |  4 +-
 .../spark/sql/execution/SQLViewSuite.scala  | 10 +++
 .../spark/sql/hive/HiveExternalCatalog.scala| 84 ++--
 3 files changed, 56 insertions(+), 42 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e862dc90/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala
--
diff --git 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala
index 1945d68..a6d56ca 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/command/views.scala
@@ -159,7 +159,9 @@ case class CreateViewCommand(
       checkCyclicViewReference(analyzedPlan, Seq(viewIdent), viewIdent)
 
       // Handles `CREATE OR REPLACE VIEW v0 AS SELECT ...`
-      catalog.alterTable(prepareTable(sparkSession, analyzedPlan))
+      // Nothing we need to retain from the old view, so just drop and create a new one
+      catalog.dropTable(viewIdent, ignoreIfNotExists = false, purge = false)
+      catalog.createTable(prepareTable(sparkSession, analyzedPlan), ignoreIfExists = false)
     } else {
       // Handles `CREATE VIEW v0 AS SELECT ...`. Throws exception when the target view already
       // exists.

http://git-wip-us.apache.org/repos/asf/spark/blob/e862dc90/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala
--
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala
index d32716c..6761f05 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/SQLViewSuite.scala
@@ -669,4 +669,14 @@ abstract class SQLViewSuite extends QueryTest with SQLTestUtils {
 "positive."))
 }
   }
+
+  test("permanent view should be case-preserving") {
+withView("v") {
+  sql("CREATE VIEW v AS SELECT 1 as aBc")
+  assert(spark.table("v").schema.head.name == "aBc")
+
+  sql("CREATE OR REPLACE VIEW v AS SELECT 2 as cBa")
+  assert(spark.table("v").schema.head.name == "cBa")
+}
+  }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/e862dc90/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
--
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
index 1945367..6e7c475 100644
--- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
+++ 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala
@@ -224,39 +224,36 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       throw new TableAlreadyExistsException(db = db, table = table)
     }
 
-    if (tableDefinition.tableType == VIEW) {
-      client.createTable(tableDefinition, ignoreIfExists)
+    // Ideally we should not create a managed table with location, but Hive serde table can
+    // specify location for managed table. And in [[CreateDataSourceTableAsSelectCommand]] we have
+    // to create the table directory and write out data before we create this table, to avoid
+    // exposing 

spark git commit: [SPARK-20989][CORE] Fail to start multiple workers on one host if external shuffle service is enabled in standalone mode

2017-06-20 Thread wenchen
Repository: spark
Updated Branches:
  refs/heads/master cc67bd573 -> ef1622899


[SPARK-20989][CORE] Fail to start multiple workers on one host if external 
shuffle service is enabled in standalone mode

## What changes were proposed in this pull request?

In standalone mode, if we enable the external shuffle service by setting
`spark.shuffle.service.enabled` to true and then try to start multiple workers on one host
(by setting `SPARK_WORKER_INSTANCES=3` in spark-env.sh and then running `sbin/start-slaves.sh`),
only one worker launches successfully on each host and the rest of the workers fail to launch.
The reason is that the port of the external shuffle service is configured by
`spark.shuffle.service.port`, so currently no more than one external shuffle service can be
started on each host. In our case, each worker tries to start an external shuffle service,
and only one of them succeeds.

We should give an explicit reason for the failure instead of failing silently.
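
A rough sketch (an assumption, not the exact code in this patch) of the kind of startup guard
added to the worker; the message text is taken from the log output below, but the wiring around
it is illustrative:

```scala
import org.apache.spark.SparkConf

// Hypothetical guard: refuse to start multiple workers per host when the external
// shuffle service is enabled, instead of failing silently later.
def checkExternalShuffleServiceConflict(conf: SparkConf, workerInstances: Int): Unit = {
  val shuffleServiceEnabled = conf.getBoolean("spark.shuffle.service.enabled", false)
  require(workerInstances <= 1 || !shuffleServiceEnabled,
    "Start multiple worker on one host failed because we may launch no more than one " +
      "external shuffle service on each host, please set spark.shuffle.service.enabled to " +
      "false or set SPARK_WORKER_INSTANCES to 1 to resolve the conflict.")
}
```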

## How was this patch tested?
Manually test by the following steps:
1. SET `SPARK_WORKER_INSTANCES=1` in `conf/spark-env.sh`;
2. SET `spark.shuffle.service.enabled` to `true` in `conf/spark-defaults.conf`;
3. Run `sbin/start-all.sh`.

Before the change, you will see no error in the command line, as the following:
```
starting org.apache.spark.deploy.master.Master, logging to 
/Users/xxx/workspace/spark/logs/spark-xxx-org.apache.spark.deploy.master.Master-1-xxx.local.out
localhost: starting org.apache.spark.deploy.worker.Worker, logging to 
/Users/xxx/workspace/spark/logs/spark-xxx-org.apache.spark.deploy.worker.Worker-1-xxx.local.out
localhost: starting org.apache.spark.deploy.worker.Worker, logging to 
/Users/xxx/workspace/spark/logs/spark-xxx-org.apache.spark.deploy.worker.Worker-2-xxx.local.out
localhost: starting org.apache.spark.deploy.worker.Worker, logging to 
/Users/xxx/workspace/spark/logs/spark-xxx-org.apache.spark.deploy.worker.Worker-3-xxx.local.out
```
And you can see in the webUI that only one worker is running.

After the change, you get explicit error messages in the command line:
```
starting org.apache.spark.deploy.master.Master, logging to 
/Users/xxx/workspace/spark/logs/spark-xxx-org.apache.spark.deploy.master.Master-1-xxx.local.out
localhost: starting org.apache.spark.deploy.worker.Worker, logging to 
/Users/xxx/workspace/spark/logs/spark-xxx-org.apache.spark.deploy.worker.Worker-1-xxx.local.out
localhost: failed to launch: nice -n 0 
/Users/xxx/workspace/spark/bin/spark-class 
org.apache.spark.deploy.worker.Worker --webui-port 8081 spark://xxx.local:7077
localhost:   17/06/13 23:24:53 INFO SecurityManager: Changing view acls to: xxx
localhost:   17/06/13 23:24:53 INFO SecurityManager: Changing modify acls to: 
xxx
localhost:   17/06/13 23:24:53 INFO SecurityManager: Changing view acls groups 
to:
localhost:   17/06/13 23:24:53 INFO SecurityManager: Changing modify acls 
groups to:
localhost:   17/06/13 23:24:53 INFO SecurityManager: SecurityManager: 
authentication disabled; ui acls disabled; users  with view permissions: 
Set(xxx); groups with view permissions: Set(); users  with modify permissions: 
Set(xxx); groups with modify permissions: Set()
localhost:   17/06/13 23:24:54 INFO Utils: Successfully started service 
'sparkWorker' on port 63354.
localhost:   Exception in thread "main" java.lang.IllegalArgumentException: 
requirement failed: Start multiple worker on one host failed because we may 
launch no more than one external shuffle service on each host, please set 
spark.shuffle.service.enabled to false or set SPARK_WORKER_INSTANCES to 1 to 
resolve the conflict.
localhost:  at scala.Predef$.require(Predef.scala:224)
localhost:  at org.apache.spark.deploy.worker.Worker$.main(Worker.scala:752)
localhost:  at org.apache.spark.deploy.worker.Worker.main(Worker.scala)
localhost: full log in 
/Users/xxx/workspace/spark/logs/spark-xxx-org.apache.spark.deploy.worker.Worker-1-xxx.local.out
localhost: starting org.apache.spark.deploy.worker.Worker, logging to 
/Users/xxx/workspace/spark/logs/spark-xxx-org.apache.spark.deploy.worker.Worker-2-xxx.local.out
localhost: failed to launch: nice -n 0 
/Users/xxx/workspace/spark/bin/spark-class 
org.apache.spark.deploy.worker.Worker --webui-port 8082 spark://xxx.local:7077
localhost:   17/06/13 23:24:56 INFO SecurityManager: Changing view acls to: xxx
localhost:   17/06/13 23:24:56 INFO SecurityManager: Changing modify acls to: 
xxx
localhost:   17/06/13 23:24:56 INFO SecurityManager: Changing view acls groups 
to:
localhost:   17/06/13 23:24:56 INFO SecurityManager: Changing modify acls 
groups to:
localhost:   17/06/13 23:24:56 INFO SecurityManager: SecurityManager: 
authentication disabled; ui acls disabled; users  with view permissions: 
Set(xxx); groups with view permissions: Set(); users  with modify permissions: 
Set(xxx); groups with modify permissions: Set()
localhost:   17/06/13 23:24:56 INFO Utils: Successfully started service 
'sparkWorker' on port 63359.

spark git commit: [SPARK-20929][ML] LinearSVC should use its own threshold param

2017-06-20 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 8965fe764 -> cc67bd573


[SPARK-20929][ML] LinearSVC should use its own threshold param

## What changes were proposed in this pull request?

LinearSVC should use its own threshold param, rather than the shared one, since 
it applies to rawPrediction instead of probability.  This PR changes the param 
in the Scala, Python and R APIs.

## How was this patch tested?

New unit test to make sure the threshold can be set to any Double value.

Author: Joseph K. Bradley 

Closes #18151 from jkbradley/ml-2.2-linearsvc-cleanup.
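
For orientation, a small usage sketch (not from the patch) of the LinearSVC-specific threshold;
`trainingData` and `testData` are placeholder DataFrames with the usual `features`/`label`
columns:

```scala
import org.apache.spark.ml.classification.LinearSVC

// The threshold applies to rawPrediction rather than a probability, so any real value is allowed.
val svc = new LinearSVC().setMaxIter(10).setRegParam(0.1)
val model = svc.fit(trainingData)
val predictions = model.setThreshold(2.5).transform(testData)  // no longer limited to [0, 1]
```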


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cc67bd57
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cc67bd57
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cc67bd57

Branch: refs/heads/master
Commit: cc67bd573264c9046c4a034927ed8deb2a732110
Parents: 8965fe7
Author: Joseph K. Bradley 
Authored: Mon Jun 19 23:04:17 2017 -0700
Committer: Joseph K. Bradley 
Committed: Mon Jun 19 23:04:17 2017 -0700

--
 R/pkg/R/mllib_classification.R  |  4 ++-
 .../spark/ml/classification/LinearSVC.scala | 25 --
 .../ml/classification/LinearSVCSuite.scala  | 35 +++-
 python/pyspark/ml/classification.py | 20 ++-
 4 files changed, 79 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/cc67bd57/R/pkg/R/mllib_classification.R
--
diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R
index 306a9b8..bdcc081 100644
--- a/R/pkg/R/mllib_classification.R
+++ b/R/pkg/R/mllib_classification.R
@@ -62,7 +62,9 @@ setClass("NaiveBayesModel", representation(jobj = "jobj"))
 #'                      of models will be always returned on the original scale, so it will be transparent for
 #'                      users. Note that with/without standardization, the models should be always converged
 #'                      to the same solution when no regularization is applied.
-#' @param threshold The threshold in binary classification, in range [0, 1].
+#' @param threshold The threshold in binary classification applied to the linear model prediction.
+#'                  This threshold can be any real number, where Inf will make all predictions 0.0
+#'                  and -Inf will make all predictions 1.0.
 #' @param weightCol The weight column name.
 #' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
 #'                         or the number of partitions are large, this param could be adjusted to a larger size.

http://git-wip-us.apache.org/repos/asf/spark/blob/cc67bd57/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
index 9900fbc..d6ed6a4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
@@ -42,7 +42,23 @@ import org.apache.spark.sql.functions.{col, lit}
 /** Params for linear SVM Classifier. */
 private[classification] trait LinearSVCParams extends ClassifierParams with HasRegParam
   with HasMaxIter with HasFitIntercept with HasTol with HasStandardization with HasWeightCol
-  with HasThreshold with HasAggregationDepth
+  with HasAggregationDepth {
+
+  /**
+   * Param for threshold in binary classification prediction.
+   * For LinearSVC, this threshold is applied to the rawPrediction, rather than a probability.
+   * This threshold can be any real number, where Inf will make all predictions 0.0
+   * and -Inf will make all predictions 1.0.
+   * Default: 0.0
+   *
+   * @group param
+   */
+  final val threshold: DoubleParam = new DoubleParam(this, "threshold",
+    "threshold in binary classification prediction applied to rawPrediction")
+
+  /** @group getParam */
+  def getThreshold: Double = $(threshold)
+}
 
 /**
  * :: Experimental ::
@@ -126,7 +142,7 @@ class LinearSVC @Since("2.2.0") (
   def setWeightCol(value: String): this.type = set(weightCol, value)
 
   /**
-   * Set threshold in binary classification, in range [0, 1].
+   * Set threshold in binary classification.
    *
    * @group setParam
    */
@@ -284,6 +300,7 @@ class LinearSVCModel private[classification] (
 
   @Since("2.2.0")
   def setThreshold(value: Double): this.type = set(threshold, value)
+  setDefault(threshold, 0.0)
 
   @Since("2.2.0")
   def setWeightCol(value: 

spark git commit: [SPARK-20929][ML] LinearSVC should use its own threshold param

2017-06-20 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 8bf7f1eba -> 514a7e6f8


[SPARK-20929][ML] LinearSVC should use its own threshold param

## What changes were proposed in this pull request?

LinearSVC should use its own threshold param, rather than the shared one, since 
it applies to rawPrediction instead of probability.  This PR changes the param 
in the Scala, Python and R APIs.

## How was this patch tested?

New unit test to make sure the threshold can be set to any Double value.

Author: Joseph K. Bradley 

Closes #18151 from jkbradley/ml-2.2-linearsvc-cleanup.

(cherry picked from commit cc67bd573264c9046c4a034927ed8deb2a732110)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/514a7e6f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/514a7e6f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/514a7e6f

Branch: refs/heads/branch-2.2
Commit: 514a7e6f8a11801c0c1e040796816f154480e75e
Parents: 8bf7f1e
Author: Joseph K. Bradley 
Authored: Mon Jun 19 23:04:17 2017 -0700
Committer: Joseph K. Bradley 
Committed: Mon Jun 19 23:04:27 2017 -0700

--
 R/pkg/R/mllib_classification.R  |  4 ++-
 .../spark/ml/classification/LinearSVC.scala | 25 --
 .../ml/classification/LinearSVCSuite.scala  | 35 +++-
 python/pyspark/ml/classification.py | 20 ++-
 4 files changed, 79 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/514a7e6f/R/pkg/R/mllib_classification.R
--
diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R
index 306a9b8..bdcc081 100644
--- a/R/pkg/R/mllib_classification.R
+++ b/R/pkg/R/mllib_classification.R
@@ -62,7 +62,9 @@ setClass("NaiveBayesModel", representation(jobj = "jobj"))
 #'                      of models will be always returned on the original scale, so it will be transparent for
 #'                      users. Note that with/without standardization, the models should be always converged
 #'                      to the same solution when no regularization is applied.
-#' @param threshold The threshold in binary classification, in range [0, 1].
+#' @param threshold The threshold in binary classification applied to the linear model prediction.
+#'                  This threshold can be any real number, where Inf will make all predictions 0.0
+#'                  and -Inf will make all predictions 1.0.
 #' @param weightCol The weight column name.
 #' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
 #'                         or the number of partitions are large, this param could be adjusted to a larger size.

http://git-wip-us.apache.org/repos/asf/spark/blob/514a7e6f/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
index 9900fbc..d6ed6a4 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
@@ -42,7 +42,23 @@ import org.apache.spark.sql.functions.{col, lit}
 /** Params for linear SVM Classifier. */
 private[classification] trait LinearSVCParams extends ClassifierParams with HasRegParam
   with HasMaxIter with HasFitIntercept with HasTol with HasStandardization with HasWeightCol
-  with HasThreshold with HasAggregationDepth
+  with HasAggregationDepth {
+
+  /**
+   * Param for threshold in binary classification prediction.
+   * For LinearSVC, this threshold is applied to the rawPrediction, rather than a probability.
+   * This threshold can be any real number, where Inf will make all predictions 0.0
+   * and -Inf will make all predictions 1.0.
+   * Default: 0.0
+   *
+   * @group param
+   */
+  final val threshold: DoubleParam = new DoubleParam(this, "threshold",
+    "threshold in binary classification prediction applied to rawPrediction")
+
+  /** @group getParam */
+  def getThreshold: Double = $(threshold)
+}
 
 /**
  * :: Experimental ::
@@ -126,7 +142,7 @@ class LinearSVC @Since("2.2.0") (
   def setWeightCol(value: String): this.type = set(weightCol, value)
 
   /**
-   * Set threshold in binary classification, in range [0, 1].
+   * Set threshold in binary classification.
    *
    * @group setParam
    */
@@ -284,6 +300,7 @@ class LinearSVCModel private[classification] (
 
   @Since("2.2.0")
   def