Repository: spark Updated Branches: refs/heads/master ce1798b3a -> d0f695089
[SPARK-12349][ML] Make spark.ml PCAModel load backwards compatible Only load explainedVariance in PCAModel if it was written with Spark > 1.6.x jkbradley is this kind of what you had in mind? Author: Sean Owen <so...@cloudera.com> Closes #10327 from srowen/SPARK-12349. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d0f69508 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d0f69508 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d0f69508 Branch: refs/heads/master Commit: d0f695089e4627273133c5f49ef7a83c1840c8f5 Parents: ce1798b Author: Sean Owen <so...@cloudera.com> Authored: Mon Dec 21 10:21:22 2015 +0000 Committer: Sean Owen <so...@cloudera.com> Committed: Mon Dec 21 10:21:22 2015 +0000 ---------------------------------------------------------------------- .../scala/org/apache/spark/ml/feature/PCA.scala | 33 +++++++++++++++++--- 1 file changed, 28 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/d0f69508/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala index 53d33ea..759be81 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala @@ -167,14 +167,37 @@ object PCAModel extends MLReadable[PCAModel] { private val className = classOf[PCAModel].getName + /** + * Loads a [[PCAModel]] from data located at the input path. Note that the model includes an + * `explainedVariance` member that is not recorded by Spark 1.6 and earlier. A model + * can be loaded from such older data but will have an empty vector for + * `explainedVariance`. + * + * @param path path to serialized model data + * @return a [[PCAModel]] + */ override def load(path: String): PCAModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) + + // explainedVariance field is not present in Spark <= 1.6 + val versionRegex = "([0-9]+)\\.([0-9])+.*".r + val hasExplainedVariance = metadata.sparkVersion match { + case versionRegex(major, minor) => + (major.toInt >= 2 || (major.toInt == 1 && minor.toInt > 6)) + case _ => false + } + val dataPath = new Path(path, "data").toString - val Row(pc: DenseMatrix, explainedVariance: DenseVector) = - sqlContext.read.parquet(dataPath) - .select("pc", "explainedVariance") - .head() - val model = new PCAModel(metadata.uid, pc, explainedVariance) + val model = if (hasExplainedVariance) { + val Row(pc: DenseMatrix, explainedVariance: DenseVector) = + sqlContext.read.parquet(dataPath) + .select("pc", "explainedVariance") + .head() + new PCAModel(metadata.uid, pc, explainedVariance) + } else { + val Row(pc: DenseMatrix) = sqlContext.read.parquet(dataPath).select("pc").head() + new PCAModel(metadata.uid, pc, Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector]) + } DefaultParamsReader.getAndSetParams(model, metadata) model } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org