[spark] branch master updated: [SPARK-28243][PYSPARK][ML][FOLLOW-UP] Move Python DecisionTreeParams to regression.py

srowen Thu, 15 Aug 2019 08:22:31 -0700

This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new ba5ee27  [SPARK-28243][PYSPARK][ML][FOLLOW-UP] Move Python DecisionTreeParams to regression.py
ba5ee27 is described below

commit ba5ee277069ca3d11b80b97bbb7235db0d0f8ff9
Author: Huaxin Gao <huax...@us.ibm.com>
AuthorDate: Thu Aug 15 10:21:26 2019 -0500

    [SPARK-28243][PYSPARK][ML][FOLLOW-UP] Move Python DecisionTreeParams to regression.py
    
    ## What changes were proposed in this pull request?
    Leave `shared.py` untouched. Move Python `DecisionTreeParams` to `regression.py`.
    
    ## How was this patch tested?
    Use existing tests
    
    Closes #25406 from huaxingao/spark-28243.
    
    Authored-by: Huaxin Gao <huax...@us.ibm.com>
    Signed-off-by: Sean Owen <sean.o...@databricks.com>
---
 python/pyspark/ml/classification.py                |  5 +-
 python/pyspark/ml/param/_shared_params_code_gen.py | 41 ------------
 python/pyspark/ml/param/shared.py                  | 54 ----------------
 python/pyspark/ml/regression.py                    | 74 ++++++++++++++++++++++
 4 files changed, 77 insertions(+), 97 deletions(-)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
index ce65439..5a1202b 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -22,8 +22,9 @@ from multiprocessing.pool import ThreadPool
 from pyspark import since, keyword_only
 from pyspark.ml import Estimator, Model
 from pyspark.ml.param.shared import *
-from pyspark.ml.regression import DecisionTreeModel, 
DecisionTreeRegressionModel, \
-    GBTParams, HasVarianceImpurity, RandomForestParams, TreeEnsembleModel
+from pyspark.ml.regression import DecisionTreeModel, DecisionTreeParams, \
+    DecisionTreeRegressionModel, GBTParams, HasVarianceImpurity, 
RandomForestParams, \
+    TreeEnsembleModel
 from pyspark.ml.util import *
 from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams
 from pyspark.ml.wrapper import JavaWrapper
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
index 1b0c8c5..ca2e4a0 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -174,45 +174,4 @@ if __name__ == "__main__":
         param_code = _gen_param_header(name, doc, defaultValueStr, 
typeConverter)
         code.append(param_code + "\n" + _gen_param_code(name, doc, 
defaultValueStr))
 
-    decisionTreeParams = [
-        ("maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 
leaf node; " +
-         "depth 1 means 1 internal node + 2 leaf nodes.", 
"TypeConverters.toInt"),
-        ("maxBins", "Max number of bins for" +
-         " discretizing continuous features.  Must be >=2 and >= number of 
categories for any" +
-         " categorical feature.", "TypeConverters.toInt"),
-        ("minInstancesPerNode", "Minimum number of instances each child must 
have after split. " +
-         "If a split causes the left or right child to have fewer than 
minInstancesPerNode, the " +
-         "split will be discarded as invalid. Should be >= 1.", 
"TypeConverters.toInt"),
-        ("minInfoGain", "Minimum information gain for a split to be considered 
at a tree node.",
-         "TypeConverters.toFloat"),
-        ("maxMemoryInMB", "Maximum memory in MB allocated to histogram 
aggregation. If too small," +
-         " then 1 node will be split per iteration, and its aggregates may 
exceed this size.",
-         "TypeConverters.toInt"),
-        ("cacheNodeIds", "If false, the algorithm will pass trees to executors 
to match " +
-         "instances with nodes. If true, the algorithm will cache node IDs for 
each instance. " +
-         "Caching can speed up training of deeper trees. Users can set how 
often should the " +
-         "cache be checkpointed or disable it by setting checkpointInterval.",
-         "TypeConverters.toBoolean")]
-
-    decisionTreeCode = '''class DecisionTreeParams(Params):
-    """
-    Mixin for Decision Tree parameters.
-    """
-
-    $dummyPlaceHolders
-
-    def __init__(self):
-        super(DecisionTreeParams, self).__init__()'''
-    dtParamMethods = ""
-    dummyPlaceholders = ""
-    paramTemplate = """$name = Param($owner, "$name", "$doc", 
typeConverter=$typeConverterStr)"""
-    for name, doc, typeConverterStr in decisionTreeParams:
-        if typeConverterStr is None:
-            typeConverterStr = str(None)
-        variable = paramTemplate.replace("$name", name).replace("$doc", doc) \
-            .replace("$typeConverterStr", typeConverterStr)
-        dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + 
"\n    "
-        dtParamMethods += _gen_param_code(name, doc, None) + "\n"
-    code.append(decisionTreeCode.replace("$dummyPlaceHolders", 
dummyPlaceholders) + "\n" +
-                dtParamMethods)
     print("\n\n\n".join(code))
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index 56d6190..9527ef6 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -747,57 +747,3 @@ class HasValidationIndicatorCol(Params):
         Gets the value of validationIndicatorCol or its default value.
         """
         return self.getOrDefault(self.validationIndicatorCol)
-
-
-class DecisionTreeParams(Params):
-    """
-    Mixin for Decision Tree parameters.
-    """
-
-    maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. 
(>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf 
nodes.", typeConverter=TypeConverters.toInt)
-    maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for 
discretizing continuous features.  Must be >=2 and >= number of categories for 
any categorical feature.", typeConverter=TypeConverters.toInt)
-    minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", 
"Minimum number of instances each child must have after split. If a split 
causes the left or right child to have fewer than minInstancesPerNode, the 
split will be discarded as invalid. Should be >= 1.", 
typeConverter=TypeConverters.toInt)
-    minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information 
gain for a split to be considered at a tree node.", 
typeConverter=TypeConverters.toFloat)
-    maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in 
MB allocated to histogram aggregation. If too small, then 1 node will be split 
per iteration, and its aggregates may exceed this size.", 
typeConverter=TypeConverters.toInt)
-    cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the 
algorithm will pass trees to executors to match instances with nodes. If true, 
the algorithm will cache node IDs for each instance. Caching can speed up 
training of deeper trees. Users can set how often should the cache be 
checkpointed or disable it by setting checkpointInterval.", 
typeConverter=TypeConverters.toBoolean)
-    
-
-    def __init__(self):
-        super(DecisionTreeParams, self).__init__()
-
-    def getMaxDepth(self):
-        """
-        Gets the value of maxDepth or its default value.
-        """
-        return self.getOrDefault(self.maxDepth)
-
-    def getMaxBins(self):
-        """
-        Gets the value of maxBins or its default value.
-        """
-        return self.getOrDefault(self.maxBins)
-
-    def getMinInstancesPerNode(self):
-        """
-        Gets the value of minInstancesPerNode or its default value.
-        """
-        return self.getOrDefault(self.minInstancesPerNode)
-
-    def getMinInfoGain(self):
-        """
-        Gets the value of minInfoGain or its default value.
-        """
-        return self.getOrDefault(self.minInfoGain)
-
-    def getMaxMemoryInMB(self):
-        """
-        Gets the value of maxMemoryInMB or its default value.
-        """
-        return self.getOrDefault(self.maxMemoryInMB)
-
-    def getCacheNodeIds(self):
-        """
-        Gets the value of cacheNodeIds or its default value.
-        """
-        return self.getOrDefault(self.cacheNodeIds)
-
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index 349130f..aca9e6f 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -584,6 +584,80 @@ class IsotonicRegressionModel(JavaModel, JavaMLWritable, 
JavaMLReadable):
         return self._call_java("predictions")
 
 
+class DecisionTreeParams(Params):
+    """
+    Mixin for Decision Tree parameters.
+    """
+
+    maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. 
(>= 0) E.g., " +
+                     "depth 0 means 1 leaf node; depth 1 means 1 internal node 
+ 2 leaf nodes.",
+                     typeConverter=TypeConverters.toInt)
+
+    maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for 
discretizing continuous " +
+                    "features.  Must be >=2 and >= number of categories for 
any categorical " +
+                    "feature.", typeConverter=TypeConverters.toInt)
+
+    minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", 
"Minimum number of " +
+                                "instances each child must have after split. 
If a split causes " +
+                                "the left or right child to have fewer than " +
+                                "minInstancesPerNode, the split will be 
discarded as invalid. " +
+                                "Should be >= 1.", 
typeConverter=TypeConverters.toInt)
+
+    minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information 
gain for a split " +
+                        "to be considered at a tree node.", 
typeConverter=TypeConverters.toFloat)
+
+    maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in 
MB allocated to " +
+                          "histogram aggregation. If too small, then 1 node 
will be split per " +
+                          "iteration, and its aggregates may exceed this 
size.",
+                          typeConverter=TypeConverters.toInt)
+
+    cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the 
algorithm will pass " +
+                         "trees to executors to match instances with nodes. If 
true, the " +
+                         "algorithm will cache node IDs for each instance. 
Caching can speed " +
+                         "up training of deeper trees. Users can set how often 
should the cache " +
+                         "be checkpointed or disable it by setting 
checkpointInterval.",
+                         typeConverter=TypeConverters.toBoolean)
+
+    def __init__(self):
+        super(DecisionTreeParams, self).__init__()
+
+    def getMaxDepth(self):
+        """
+        Gets the value of maxDepth or its default value.
+        """
+        return self.getOrDefault(self.maxDepth)
+
+    def getMaxBins(self):
+        """
+        Gets the value of maxBins or its default value.
+        """
+        return self.getOrDefault(self.maxBins)
+
+    def getMinInstancesPerNode(self):
+        """
+        Gets the value of minInstancesPerNode or its default value.
+        """
+        return self.getOrDefault(self.minInstancesPerNode)
+
+    def getMinInfoGain(self):
+        """
+        Gets the value of minInfoGain or its default value.
+        """
+        return self.getOrDefault(self.minInfoGain)
+
+    def getMaxMemoryInMB(self):
+        """
+        Gets the value of maxMemoryInMB or its default value.
+        """
+        return self.getOrDefault(self.maxMemoryInMB)
+
+    def getCacheNodeIds(self):
+        """
+        Gets the value of cacheNodeIds or its default value.
+        """
+        return self.getOrDefault(self.cacheNodeIds)
+
+
 class TreeEnsembleParams(DecisionTreeParams):
     """
     Mixin for Decision Tree-based ensemble algorithms parameters.


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-28243][PYSPARK][ML][FOLLOW-UP] Move Python DecisionTreeParams to regression.py

Reply via email to