Github user MLnick commented on a diff in the pull request:

    https://github.com/apache/spark/pull/12660#discussion_r61392206
  
    --- Diff: 
mllib/src/test/scala/org/apache/spark/ml/recommendation/ALSSuite.scala ---
    @@ -512,6 +514,55 @@ class ALSSuite
         assert(getFactors(model.userFactors) === 
getFactors(model2.userFactors))
         assert(getFactors(model.itemFactors) === 
getFactors(model2.itemFactors))
       }
    +
    +  test("StorageLevel param") {
    +    // test invalid param values
    +    intercept[IllegalArgumentException] {
    +      new ALS().setIntermediateRDDStorageLevel("foo")
    +    }
    +    intercept[IllegalArgumentException] {
    +      new ALS().setIntermediateRDDStorageLevel("NONE")
    +    }
    +    intercept[IllegalArgumentException] {
    +      new ALS().setFinalRDDStorageLevel("foo")
    +    }
    +    // test StorageLevels
    +    val sqlContext = this.sqlContext
    +    import sqlContext.implicits._
    +    val (ratings, _) = genExplicitTestData(numUsers = 2, numItems = 2, 
rank = 1)
    +    val data = ratings.toDF
    +    val als = new ALS().setMaxIter(1)
    +    als.fit(data)
    +    val factorRDD = sc.getPersistentRDDs.collect {
    +      case (id, rdd) if rdd.name == "userFactors" => rdd
    +    }.head
    +    assert(factorRDD.getStorageLevel == StorageLevel.MEMORY_AND_DISK)
    +    val listener = new RDDStorageListener
    +    sc.addSparkListener(listener)
    +    als
    +      .setFinalRDDStorageLevel("MEMORY_ONLY")
    +      .setIntermediateRDDStorageLevel("DISK_ONLY")
    +      .fit(data)
    +    val level = sc.getRDDStorageInfo { rdd =>
    +      rdd.name == "userFactors" && rdd.id != factorRDD.id
    +    }.head.storageLevel
    +    assert(level == StorageLevel.MEMORY_ONLY)
    +    listener.infos.foreach(level => assert(level == 
StorageLevel.DISK_ONLY))
    +  }
    +}
    +
    +private class RDDStorageListener extends SparkListener {
    +
    +  var infos: Seq[StorageLevel] = Seq()
    +
    +  override def onStageCompleted(stageCompleted: 
SparkListenerStageCompleted): Unit = {
    +    val info = stageCompleted.stageInfo.rddInfos.collect {
    +      case info if info.name.contains("Blocks") || 
info.name.contains("Factors-") =>
    +        info.storageLevel
    +    }
    +    infos = info
    --- End diff --
    
    I can amend it to collect all the intermediates, but I set maxIter to `1`,
so it should only have 1 iteration's worth. In any case, I'll amend this slightly
to append to `infos`; that way, if any `stageInfo` contains the intermediate RDD
infos, they will be appended — otherwise not.


---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to