Repository: spark Updated Branches: refs/heads/master bd9655063 -> b264cbb16
[SPARK-15113][PYSPARK][ML] Add missing num features num classes ## What changes were proposed in this pull request? Add missing `numFeatures` and `numClasses` to the wrapped Java models in PySpark ML pipelines. Also tag `DecisionTreeClassificationModel` as Expiremental to match Scala doc. ## How was this patch tested? Extended doctests Author: Holden Karau <hol...@us.ibm.com> Closes #12889 from holdenk/SPARK-15113-add-missing-numFeatures-numClasses. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b264cbb1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b264cbb1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b264cbb1 Branch: refs/heads/master Commit: b264cbb16fb97116e630fb593adf5898a5a0e8fa Parents: bd96550 Author: Holden Karau <hol...@us.ibm.com> Authored: Mon Aug 22 12:21:22 2016 +0200 Committer: Nick Pentreath <ni...@za.ibm.com> Committed: Mon Aug 22 12:21:22 2016 +0200 ---------------------------------------------------------------------- .../GeneralizedLinearRegression.scala | 2 ++ python/pyspark/ml/classification.py | 37 ++++++++++++++++---- python/pyspark/ml/regression.py | 22 +++++++++--- python/pyspark/ml/util.py | 16 +++++++++ 4 files changed, 66 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/b264cbb1/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala index 2bdc09e..1d4dfd1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala @@ -788,6 +788,8 @@ class GeneralizedLinearRegressionModel private[ml] ( @Since("2.0.0") override def write: MLWriter = new GeneralizedLinearRegressionModel.GeneralizedLinearRegressionModelWriter(this) + + override val numFeatures: Int = coefficients.size } @Since("2.0.0") http://git-wip-us.apache.org/repos/asf/spark/blob/b264cbb1/python/pyspark/ml/classification.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 6468007..33ada27 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -44,6 +44,23 @@ __all__ = ['LogisticRegression', 'LogisticRegressionModel', @inherit_doc +class JavaClassificationModel(JavaPredictionModel): + """ + (Private) Java Model produced by a ``Classifier``. + Classes are indexed {0, 1, ..., numClasses - 1}. + To be mixed in with class:`pyspark.ml.JavaModel` + """ + + @property + @since("2.1.0") + def numClasses(self): + """ + Number of classes (values which the label can take). + """ + return self._call_java("numClasses") + + +@inherit_doc class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol, HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds, @@ -212,7 +229,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti " threshold (%g) and thresholds (equivalent to %g)" % (t2, t)) -class LogisticRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable): +class LogisticRegressionModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable): """ Model fitted by LogisticRegression. @@ -522,6 +539,10 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred 1 >>> model.featureImportances SparseVector(1, {0: 1.0}) + >>> model.numFeatures + 1 + >>> model.numClasses + 2 >>> print(model.toDebugString) DecisionTreeClassificationModel (uid=...) of depth 1 with 3 nodes... >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) @@ -595,7 +616,8 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred @inherit_doc -class DecisionTreeClassificationModel(DecisionTreeModel, JavaMLWritable, JavaMLReadable): +class DecisionTreeClassificationModel(DecisionTreeModel, JavaClassificationModel, JavaMLWritable, + JavaMLReadable): """ Model fitted by DecisionTreeClassifier. @@ -722,7 +744,8 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred return RandomForestClassificationModel(java_model) -class RandomForestClassificationModel(TreeEnsembleModel, JavaMLWritable, JavaMLReadable): +class RandomForestClassificationModel(TreeEnsembleModel, JavaClassificationModel, JavaMLWritable, + JavaMLReadable): """ Model fitted by RandomForestClassifier. @@ -873,7 +896,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol return self.getOrDefault(self.lossType) -class GBTClassificationModel(TreeEnsembleModel, JavaMLWritable, JavaMLReadable): +class GBTClassificationModel(TreeEnsembleModel, JavaPredictionModel, JavaMLWritable, + JavaMLReadable): """ Model fitted by GBTClassifier. @@ -1027,7 +1051,7 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H return self.getOrDefault(self.modelType) -class NaiveBayesModel(JavaModel, JavaMLWritable, JavaMLReadable): +class NaiveBayesModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable): """ Model fitted by NaiveBayes. @@ -1226,7 +1250,8 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, return self.getOrDefault(self.initialWeights) -class MultilayerPerceptronClassificationModel(JavaModel, JavaMLWritable, JavaMLReadable): +class MultilayerPerceptronClassificationModel(JavaModel, JavaPredictionModel, JavaMLWritable, + JavaMLReadable): """ .. note:: Experimental http://git-wip-us.apache.org/repos/asf/spark/blob/b264cbb1/python/pyspark/ml/regression.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 1ae2bd4..56312f6 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -88,6 +88,8 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction True >>> model.intercept == model2.intercept True + >>> model.numFeatures + 1 .. versionadded:: 1.4.0 """ @@ -126,7 +128,7 @@ class LinearRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPrediction return LinearRegressionModel(java_model) -class LinearRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable): +class LinearRegressionModel(JavaModel, JavaPredictionModel, JavaMLWritable, JavaMLReadable): """ Model fitted by :class:`LinearRegression`. @@ -654,6 +656,8 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi 3 >>> model.featureImportances SparseVector(1, {0: 1.0}) + >>> model.numFeatures + 1 >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -719,7 +723,7 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi @inherit_doc -class DecisionTreeModel(JavaModel): +class DecisionTreeModel(JavaModel, JavaPredictionModel): """ Abstraction for Decision Tree models. @@ -843,6 +847,8 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 + >>> model.numFeatures + 1 >>> model.trees [DecisionTreeRegressionModel (uid=...) of depth..., DecisionTreeRegressionModel...] >>> model.getNumTrees @@ -909,7 +915,8 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi return RandomForestRegressionModel(java_model) -class RandomForestRegressionModel(TreeEnsembleModel, JavaMLWritable, JavaMLReadable): +class RandomForestRegressionModel(TreeEnsembleModel, JavaPredictionModel, JavaMLWritable, + JavaMLReadable): """ Model fitted by :class:`RandomForestRegressor`. @@ -958,6 +965,8 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, >>> model = gbt.fit(df) >>> model.featureImportances SparseVector(1, {0: 1.0}) + >>> model.numFeatures + 1 >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) True >>> test0 = spark.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) @@ -1047,7 +1056,7 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, return self.getOrDefault(self.lossType) -class GBTRegressionModel(TreeEnsembleModel, JavaMLWritable, JavaMLReadable): +class GBTRegressionModel(TreeEnsembleModel, JavaPredictionModel, JavaMLWritable, JavaMLReadable): """ Model fitted by :class:`GBTRegressor`. @@ -1307,6 +1316,8 @@ class GeneralizedLinearRegression(JavaEstimator, HasLabelCol, HasFeaturesCol, Ha True >>> model.coefficients DenseVector([1.5..., -1.0...]) + >>> model.numFeatures + 2 >>> abs(model.intercept - 1.5) < 0.001 True >>> glr_path = temp_path + "/glr" @@ -1412,7 +1423,8 @@ class GeneralizedLinearRegression(JavaEstimator, HasLabelCol, HasFeaturesCol, Ha return self.getOrDefault(self.link) -class GeneralizedLinearRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable): +class GeneralizedLinearRegressionModel(JavaModel, JavaPredictionModel, JavaMLWritable, + JavaMLReadable): """ .. note:: Experimental http://git-wip-us.apache.org/repos/asf/spark/blob/b264cbb1/python/pyspark/ml/util.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/util.py b/python/pyspark/ml/util.py index 4a31a29..7d39c30 100644 --- a/python/pyspark/ml/util.py +++ b/python/pyspark/ml/util.py @@ -238,3 +238,19 @@ class JavaMLReadable(MLReadable): def read(cls): """Returns an MLReader instance for this class.""" return JavaMLReader(cls) + + +@inherit_doc +class JavaPredictionModel(): + """ + (Private) Java Model for prediction tasks (regression and classification). + To be mixed in with class:`pyspark.ml.JavaModel` + """ + + @property + @since("2.1.0") + def numFeatures(self): + """ + Returns the number of features the model was trained on. If unknown, returns -1 + """ + return self._call_java("numFeatures") --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org