Github user mengxr commented on a diff in the pull request:

    https://github.com/apache/spark/pull/9674#discussion_r44987110
  
    --- Diff: mllib/src/main/scala/org/apache/spark/ml/Pipeline.scala ---
    @@ -200,4 +235,121 @@ class PipelineModel private[ml] (
       override def copy(extra: ParamMap): PipelineModel = {
         new PipelineModel(uid, stages.map(_.copy(extra))).setParent(parent)
       }
    +
    +  override def write: Writer = new PipelineModelWriter(this)
    +}
    +
    +object PipelineModel extends Readable[PipelineModel] {
    +
    +  override def read: Reader[PipelineModel] = new PipelineModelReader
    +
    +  override def load(path: String): PipelineModel = read.load(path)
    +}
    +
    +private[ml] class PipelineModelWriter(instance: PipelineModel) extends Writer {
    +
    +  PipelineSharedWriter.validateStages(instance.stages.asInstanceOf[Array[PipelineStage]])
    +
    +  override protected def saveImpl(path: String): Unit = PipelineSharedWriter.saveImpl(instance,
    +    instance.stages.asInstanceOf[Array[PipelineStage]], sc, path)
    +}
    +
    +private[ml] class PipelineModelReader extends Reader[PipelineModel] {
    +
    +  /** Checked against metadata when loading model */
    +  private val className = "org.apache.spark.ml.PipelineModel"
    +
    +  override def load(path: String): PipelineModel = {
    +    val (uid: String, stages: Array[PipelineStage]) =
    +      PipelineSharedReader.load(className, sc, path)
    +    val transformers = stages map {
    +      case stage: Transformer => stage
    +      case stage => throw new RuntimeException(s"PipelineModel.read loaded a stage but found it" +
    +        s" was not a Transformer.  Bad stage: ${stage.uid}")
    +    }
    +    new PipelineModel(uid, transformers)
    +  }
    +}
    +
    +/** Methods for [[Writer]] shared between [[Pipeline]] and [[PipelineModel]] */
    +private[ml] object PipelineSharedWriter {
    +
    +  import org.json4s.JsonDSL._
    +
    +  /** Check that all stages are Writable */
    +  def validateStages(stages: Array[PipelineStage]): Unit = {
    +    stages.foreach {
    +      case stage: Writable => // good
    +      case stage =>
    +        throw new UnsupportedOperationException("Pipeline write will fail on this Pipeline" +
    +          s" because it contains a stage which does not implement Writable. Non-Writable stage:" +
    +          s" ${stage.uid}")
    +    }
    +  }
    +
    +  def saveImpl(
    +      instance: Params,
    +      stages: Array[PipelineStage],
    +      sc: SparkContext,
    +      path: String): Unit = {
    +    // Copied and edited from DefaultParamsWriter.saveMetadata
    +    // TODO: modify DefaultParamsWriter.saveMetadata to avoid duplication
    +    val uid = instance.uid
    +    val cls = instance.getClass.getName
    +    val stageUids = stages.map(_.uid)
    +    val jsonParams = List("stageUids" -> parse(compact(render(stageUids.toSeq))))
    +    val metadata = ("class" -> cls) ~
    +      ("timestamp" -> System.currentTimeMillis()) ~
    +      ("sparkVersion" -> sc.version) ~
    +      ("uid" -> uid) ~
    +      ("paramMap" -> jsonParams)
    +    val metadataPath = new Path(path, "metadata").toString
    +    val metadataJson = compact(render(metadata))
    +    sc.parallelize(Seq(metadataJson), 1).saveAsTextFile(metadataPath)
    +
    +    // Save stages
    +    val stagesDir = new Path(path, "stages").toString
    +    stages.foreach {
    +      case stage: Writable =>
    +        val stagePath = new Path(stagesDir, stage.uid).toString
    +        stage.write.save(stagePath)
    +    }
    +  }
    +}
    +
    +/** Methods for [[Reader]] shared between [[Pipeline]] and [[PipelineModel]] */
    +private[ml] object PipelineSharedReader {
    +
    +  def load(className: String, sc: SparkContext, path: String): (String, Array[PipelineStage]) = {
    +    val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
    +
    +    implicit val format = DefaultFormats
    +    val stagesDir = new Path(path, "stages").toString
    +    val stageUids: Array[String] = metadata.params match {
    +      case JObject(pairs) =>
    +        if (pairs.length != 1) {
    +          // Should not happen unless file is corrupted or we have a bug.
    +          throw new RuntimeException(
    +            s"Pipeline read expected 1 Param (stageUids), but found 
${pairs.length}.")
    +        }
    +        pairs.head match {
    +          case ("stageUids", jsonValue) =>
    +            parse(compact(render(jsonValue))).extract[Seq[String]].toArray
    --- End diff ---
    
    Would `jsonValue.extract[Seq[String]].toArray` work?
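    
    For reference, a minimal standalone json4s sketch of the two approaches (with a hypothetical `stageUids` payload; it assumes `org.json4s.jackson` and an implicit `DefaultFormats` in scope, as in the diff above):
    
    ```scala
    import org.json4s._
    import org.json4s.jackson.JsonMethods._
    
    implicit val format: Formats = DefaultFormats
    
    // Hypothetical JValue standing in for the "stageUids" entry read from metadata.
    val jsonValue: JValue = parse("""["stage_0", "stage_1"]""")
    
    // As written in the diff: render the JValue back to a string, re-parse it, then extract.
    val roundTrip: Array[String] = parse(compact(render(jsonValue))).extract[Seq[String]].toArray
    
    // Suggested simplification: extract directly from the JValue.
    val direct: Array[String] = jsonValue.extract[Seq[String]].toArray
    
    assert(roundTrip.sameElements(direct))
    ```
    
    Since `metadata.params` already holds a `JValue`, extracting directly should avoid the compact/render/parse round trip.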

