spark git commit: [SPARK-18608][ML] Fix double caching

2017-09-12 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-2.2 63098dc31 -> b606dc177


[SPARK-18608][ML] Fix double caching

## What changes were proposed in this pull request?
Replace `df.rdd.getStorageLevel` with `df.storageLevel` when checking whether the input dataset is already persisted.

Ran the command `find . -name '*.scala' | xargs -i bash -c 'egrep -in 
"\.rdd\.getStorageLevel" {} && echo {}'` to confirm that every algorithm 
affected by this issue is fixed.

Previous discussion in other PRs: https://github.com/apache/spark/pull/19107, 
https://github.com/apache/spark/pull/17014

## How was this patch tested?
existing tests

Author: Zheng RuiFeng 

Closes #19197 from zhengruifeng/double_caching.

(cherry picked from commit c5f9b89dda40ffaa4622a7ba2b3d0605dbe815c0)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b606dc17
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b606dc17
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b606dc17

Branch: refs/heads/branch-2.2
Commit: b606dc177e177bdbf99e72638eb8baec12e9fb53
Parents: 63098dc
Author: Zheng RuiFeng 
Authored: Tue Sep 12 11:37:05 2017 -0700
Committer: Joseph K. Bradley 
Committed: Tue Sep 12 11:37:22 2017 -0700

--
 .../org/apache/spark/ml/classification/LogisticRegression.scala  | 2 +-
 .../scala/org/apache/spark/ml/classification/OneVsRest.scala | 4 ++--
 mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala | 2 +-
 .../org/apache/spark/ml/regression/AFTSurvivalRegression.scala   | 2 +-
 .../org/apache/spark/ml/regression/IsotonicRegression.scala  | 2 +-
 .../scala/org/apache/spark/ml/regression/LinearRegression.scala  | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b606dc17/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 1de2373..e7f99fc 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -483,7 +483,7 @@ class LogisticRegression @Since("1.2.0") (
   }
 
   override protected[spark] def train(dataset: Dataset[_]): 
LogisticRegressionModel = {
-val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
+val handlePersistence = dataset.storageLevel == StorageLevel.NONE
 train(dataset, handlePersistence)
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/b606dc17/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala 
b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
index 05b8c3a..f3aff4c 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
@@ -164,7 +164,7 @@ final class OneVsRestModel private[ml] (
 val newDataset = dataset.withColumn(accColName, initUDF())
 
 // persist if underlying dataset is not persistent.
-val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
+val handlePersistence = dataset.storageLevel == StorageLevel.NONE
 if (handlePersistence) {
   newDataset.persist(StorageLevel.MEMORY_AND_DISK)
 }
@@ -347,7 +347,7 @@ final class OneVsRest @Since("1.4.0") (
 }
 
 // persist if underlying dataset is not persistent.
-val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
+val handlePersistence = dataset.storageLevel == StorageLevel.NONE
 if (handlePersistence) {
   multiclassLabeled.persist(StorageLevel.MEMORY_AND_DISK)
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/b606dc17/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index e02b532..f2af7fe 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -304,7 +304,7 @@ class KMeans @Since("1.5.0") (
   override def fit(dataset: Dataset[_]): KMeansModel = {
 transformSchema(dataset.schema, logging = true)
 
-val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
+val handlePersistence = dataset.storageLevel == StorageLevel.NONE
 val i

spark git commit: [SPARK-18608][ML] Fix double caching

2017-09-12 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master b9b54b1c8 -> c5f9b89dd


[SPARK-18608][ML] Fix double caching

## What changes were proposed in this pull request?
Replace `df.rdd.getStorageLevel` with `df.storageLevel` when checking whether the input dataset is already persisted.

Ran the command `find . -name '*.scala' | xargs -i bash -c 'egrep -in 
"\.rdd\.getStorageLevel" {} && echo {}'` to confirm that every algorithm 
affected by this issue is fixed.

Previous discussion in other PRs: https://github.com/apache/spark/pull/19107, 
https://github.com/apache/spark/pull/17014

## How was this patch tested?
existing tests

Author: Zheng RuiFeng 

Closes #19197 from zhengruifeng/double_caching.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c5f9b89d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c5f9b89d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c5f9b89d

Branch: refs/heads/master
Commit: c5f9b89dda40ffaa4622a7ba2b3d0605dbe815c0
Parents: b9b54b1
Author: Zheng RuiFeng 
Authored: Tue Sep 12 11:37:05 2017 -0700
Committer: Joseph K. Bradley 
Committed: Tue Sep 12 11:37:05 2017 -0700

--
 .../org/apache/spark/ml/classification/LogisticRegression.scala  | 2 +-
 .../scala/org/apache/spark/ml/classification/OneVsRest.scala | 4 ++--
 mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala | 2 +-
 .../org/apache/spark/ml/regression/AFTSurvivalRegression.scala   | 2 +-
 .../org/apache/spark/ml/regression/IsotonicRegression.scala  | 2 +-
 .../scala/org/apache/spark/ml/regression/LinearRegression.scala  | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c5f9b89d/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index f491a67..cbc8f4a 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -484,7 +484,7 @@ class LogisticRegression @Since("1.2.0") (
   }
 
   override protected[spark] def train(dataset: Dataset[_]): 
LogisticRegressionModel = {
-val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
+val handlePersistence = dataset.storageLevel == StorageLevel.NONE
 train(dataset, handlePersistence)
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/c5f9b89d/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala 
b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
index 99bb234..942e981 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
@@ -165,7 +165,7 @@ final class OneVsRestModel private[ml] (
 val newDataset = dataset.withColumn(accColName, initUDF())
 
 // persist if underlying dataset is not persistent.
-val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
+val handlePersistence = dataset.storageLevel == StorageLevel.NONE
 if (handlePersistence) {
   newDataset.persist(StorageLevel.MEMORY_AND_DISK)
 }
@@ -358,7 +358,7 @@ final class OneVsRest @Since("1.4.0") (
 }
 
 // persist if underlying dataset is not persistent.
-val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
+val handlePersistence = dataset.storageLevel == StorageLevel.NONE
 if (handlePersistence) {
   multiclassLabeled.persist(StorageLevel.MEMORY_AND_DISK)
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/c5f9b89d/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index e02b532..f2af7fe 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -304,7 +304,7 @@ class KMeans @Since("1.5.0") (
   override def fit(dataset: Dataset[_]): KMeansModel = {
 transformSchema(dataset.schema, logging = true)
 
-val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE
+val handlePersistence = dataset.storageLevel == StorageLevel.NONE
 val instances: RDD[OldVector] = 
dataset.select(col($(featuresCol))).rdd.map {
   case Row(point: Vector) => OldV