spark git commit: [SPARK-23161][PYSPARK][ML] Add missing APIs to Python GBTClassifier

cutlerb Wed, 30 May 2018 11:05:08 -0700

Repository: spark
Updated Branches:
  refs/heads/master b142157dc -> ec6f971dc



[SPARK-23161][PYSPARK][ML] Add missing APIs to Python GBTClassifier

## What changes were proposed in this pull request?

Add the featureSubsetStrategy param to GBTClassifier and GBTRegressor. Also
make GBTClassificationModel inherit from JavaClassificationModel instead of
JavaPredictionModel so that it exposes numClasses.

## How was this patch tested?

Added doctests for getFeatureSubsetStrategy and numClasses.

Author: Huaxin Gao <huax...@us.ibm.com>

Closes #21413 from huaxingao/spark-23161.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ec6f971d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ec6f971d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ec6f971d

Branch: refs/heads/master
Commit: ec6f971dc57bcdc0ad65ac1987b6f0c1801157f4
Parents: b142157
Author: Huaxin Gao <huax...@us.ibm.com>
Authored: Wed May 30 11:04:09 2018 -0700
Committer: Bryan Cutler <cutl...@gmail.com>
Committed: Wed May 30 11:04:09 2018 -0700

----------------------------------------------------------------------
 python/pyspark/ml/classification.py | 35 +++++++++++++---
 python/pyspark/ml/regression.py     | 70 ++++++++++++++++++++------------
 2 files changed, 74 insertions(+), 31 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/ec6f971d/python/pyspark/ml/classification.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/classification.py 
b/python/pyspark/ml/classification.py
index 424ecfd..1754c48 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -1131,6 +1131,13 @@ class RandomForestClassifier(JavaEstimator, 
HasFeaturesCol, HasLabelCol, HasPred
     def _create_model(self, java_model):
         return RandomForestClassificationModel(java_model)
 
+    @since("2.4.0")
+    def setFeatureSubsetStrategy(self, value):
+        """
+        Sets the value of :py:attr:`featureSubsetStrategy`.
+        """
+        return self._set(featureSubsetStrategy=value)
+
 
 class RandomForestClassificationModel(TreeEnsembleModel, 
JavaClassificationModel, JavaMLWritable,
                                       JavaMLReadable):
@@ -1193,6 +1200,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredictionCol
     >>> si_model = stringIndexer.fit(df)
     >>> td = si_model.transform(df)
     >>> gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42)
+    >>> gbt.getFeatureSubsetStrategy()
+    'all'
     >>> model = gbt.fit(td)
     >>> model.featureImportances
     SparseVector(1, {0: 1.0})
@@ -1226,6 +1235,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredictionCol
     ...              ["indexed", "features"])
     >>> model.evaluateEachIteration(validation)
     [0.25..., 0.23..., 0.21..., 0.19..., 0.18...]
+    >>> model.numClasses
+    2
 
     .. versionadded:: 1.4.0
     """
@@ -1244,19 +1255,22 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredictionCol
     def __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0,
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, 
lossType="logistic",
-                 maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0):
+                 maxIter=20, stepSize=0.1, seed=None, subsamplingRate=1.0,
+                 featureSubsetStrategy="all"):
         """
         __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction", \
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0, \
                  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, 
\
-                 lossType="logistic", maxIter=20, stepSize=0.1, seed=None, 
subsamplingRate=1.0)
+                 lossType="logistic", maxIter=20, stepSize=0.1, seed=None, 
subsamplingRate=1.0, \
+                 featureSubsetStrategy="all")
         """
         super(GBTClassifier, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.classification.GBTClassifier", self.uid)
         self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0,
                          maxMemoryInMB=256, cacheNodeIds=False, 
checkpointInterval=10,
-                         lossType="logistic", maxIter=20, stepSize=0.1, 
subsamplingRate=1.0)
+                         lossType="logistic", maxIter=20, stepSize=0.1, 
subsamplingRate=1.0,
+                         featureSubsetStrategy="all")
         kwargs = self._input_kwargs
         self.setParams(**kwargs)
 
@@ -1265,12 +1279,14 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredictionCol
     def setParams(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
                   maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0,
                   maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10,
-                  lossType="logistic", maxIter=20, stepSize=0.1, seed=None, 
subsamplingRate=1.0):
+                  lossType="logistic", maxIter=20, stepSize=0.1, seed=None, 
subsamplingRate=1.0,
+                  featureSubsetStrategy="all"):
         """
         setParams(self, featuresCol="features", labelCol="label", 
predictionCol="prediction", \
                   maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0, \
                   maxMemoryInMB=256, cacheNodeIds=False, 
checkpointInterval=10, \
-                  lossType="logistic", maxIter=20, stepSize=0.1, seed=None, 
subsamplingRate=1.0)
+                  lossType="logistic", maxIter=20, stepSize=0.1, seed=None, 
subsamplingRate=1.0, \
+                  featureSubsetStrategy="all")
         Sets params for Gradient Boosted Tree Classification.
         """
         kwargs = self._input_kwargs
@@ -1293,8 +1309,15 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredictionCol
         """
         return self.getOrDefault(self.lossType)
 
+    @since("2.4.0")
+    def setFeatureSubsetStrategy(self, value):
+        """
+        Sets the value of :py:attr:`featureSubsetStrategy`.
+        """
+        return self._set(featureSubsetStrategy=value)
+
 
-class GBTClassificationModel(TreeEnsembleModel, JavaPredictionModel, 
JavaMLWritable,
+class GBTClassificationModel(TreeEnsembleModel, JavaClassificationModel, 
JavaMLWritable,
                              JavaMLReadable):
     """
     Model fitted by GBTClassifier.

http://git-wip-us.apache.org/repos/asf/spark/blob/ec6f971d/python/pyspark/ml/regression.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index dd0b62f..dba0e57 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -602,6 +602,14 @@ class TreeEnsembleParams(DecisionTreeParams):
                             "used for learning each decision tree, in range 
(0, 1].",
                             typeConverter=TypeConverters.toFloat)
 
+    supportedFeatureSubsetStrategies = ["auto", "all", "onethird", "sqrt", 
"log2"]
+
+    featureSubsetStrategy = \
+        Param(Params._dummy(), "featureSubsetStrategy",
+              "The number of features to consider for splits at each tree 
node. Supported " +
+              "options: " + ", ".join(supportedFeatureSubsetStrategies) + ", 
(0.0-1.0], [1-n].",
+              typeConverter=TypeConverters.toString)
+
     def __init__(self):
         super(TreeEnsembleParams, self).__init__()
 
@@ -619,6 +627,22 @@ class TreeEnsembleParams(DecisionTreeParams):
         """
         return self.getOrDefault(self.subsamplingRate)
 
+    @since("1.4.0")
+    def setFeatureSubsetStrategy(self, value):
+        """
+        Sets the value of :py:attr:`featureSubsetStrategy`.
+
+        .. note:: Deprecated in 2.4.0 and will be removed in 3.0.0.
+        """
+        return self._set(featureSubsetStrategy=value)
+
+    @since("1.4.0")
+    def getFeatureSubsetStrategy(self):
+        """
+        Gets the value of featureSubsetStrategy or its default value.
+        """
+        return self.getOrDefault(self.featureSubsetStrategy)
+
 
 class TreeRegressorParams(Params):
     """
@@ -654,14 +678,8 @@ class RandomForestParams(TreeEnsembleParams):
     Private class to track supported random forest parameters.
     """
 
-    supportedFeatureSubsetStrategies = ["auto", "all", "onethird", "sqrt", 
"log2"]
     numTrees = Param(Params._dummy(), "numTrees", "Number of trees to train 
(>= 1).",
                      typeConverter=TypeConverters.toInt)
-    featureSubsetStrategy = \
-        Param(Params._dummy(), "featureSubsetStrategy",
-              "The number of features to consider for splits at each tree 
node. Supported " +
-              "options: " + ", ".join(supportedFeatureSubsetStrategies) + ", 
(0.0-1.0], [1-n].",
-              typeConverter=TypeConverters.toString)
 
     def __init__(self):
         super(RandomForestParams, self).__init__()
@@ -680,20 +698,6 @@ class RandomForestParams(TreeEnsembleParams):
         """
         return self.getOrDefault(self.numTrees)
 
-    @since("1.4.0")
-    def setFeatureSubsetStrategy(self, value):
-        """
-        Sets the value of :py:attr:`featureSubsetStrategy`.
-        """
-        return self._set(featureSubsetStrategy=value)
-
-    @since("1.4.0")
-    def getFeatureSubsetStrategy(self):
-        """
-        Gets the value of featureSubsetStrategy or its default value.
-        """
-        return self.getOrDefault(self.featureSubsetStrategy)
-
 
 class GBTParams(TreeEnsembleParams):
     """
@@ -981,6 +985,13 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredi
     def _create_model(self, java_model):
         return RandomForestRegressionModel(java_model)
 
+    @since("2.4.0")
+    def setFeatureSubsetStrategy(self, value):
+        """
+        Sets the value of :py:attr:`featureSubsetStrategy`.
+        """
+        return self._set(featureSubsetStrategy=value)
+
 
 class RandomForestRegressionModel(TreeEnsembleModel, JavaPredictionModel, 
JavaMLWritable,
                                   JavaMLReadable):
@@ -1029,6 +1040,8 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredictionCol,
     >>> gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42)
     >>> print(gbt.getImpurity())
     variance
+    >>> print(gbt.getFeatureSubsetStrategy())
+    all
     >>> model = gbt.fit(df)
     >>> model.featureImportances
     SparseVector(1, {0: 1.0})
@@ -1079,20 +1092,20 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredictionCol,
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0,
                  maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0,
                  checkpointInterval=10, lossType="squared", maxIter=20, 
stepSize=0.1, seed=None,
-                 impurity="variance"):
+                 impurity="variance", featureSubsetStrategy="all"):
         """
         __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction", \
                  maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0, \
                  maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \
                  checkpointInterval=10, lossType="squared", maxIter=20, 
stepSize=0.1, seed=None, \
-                 impurity="variance")
+                 impurity="variance", featureSubsetStrategy="all")
         """
         super(GBTRegressor, self).__init__()
         self._java_obj = 
self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid)
         self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0,
                          maxMemoryInMB=256, cacheNodeIds=False, 
subsamplingRate=1.0,
                          checkpointInterval=10, lossType="squared", 
maxIter=20, stepSize=0.1,
-                         impurity="variance")
+                         impurity="variance", featureSubsetStrategy="all")
         kwargs = self._input_kwargs
         self.setParams(**kwargs)
 
@@ -1102,13 +1115,13 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredictionCol,
                   maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0,
                   maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0,
                   checkpointInterval=10, lossType="squared", maxIter=20, 
stepSize=0.1, seed=None,
-                  impuriy="variance"):
+                  impuriy="variance", featureSubsetStrategy="all"):
         """
         setParams(self, featuresCol="features", labelCol="label", 
predictionCol="prediction", \
                   maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0, \
                   maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \
                   checkpointInterval=10, lossType="squared", maxIter=20, 
stepSize=0.1, seed=None, \
-                  impurity="variance")
+                  impurity="variance", featureSubsetStrategy="all")
         Sets params for Gradient Boosted Tree Regression.
         """
         kwargs = self._input_kwargs
@@ -1131,6 +1144,13 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredictionCol,
         """
         return self.getOrDefault(self.lossType)
 
+    @since("2.4.0")
+    def setFeatureSubsetStrategy(self, value):
+        """
+        Sets the value of :py:attr:`featureSubsetStrategy`.
+        """
+        return self._set(featureSubsetStrategy=value)
+
 
 class GBTRegressionModel(TreeEnsembleModel, JavaPredictionModel, 
JavaMLWritable, JavaMLReadable):
     """


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-23161][PYSPARK][ML] Add missing APIs to Python GBTClassifier

Reply via email to