This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new ba5ee27 [SPARK-28243][PYSPARK][ML][FOLLOW-UP] Move Python DecisionTreeParams to regression.py ba5ee27 is described below commit ba5ee277069ca3d11b80b97bbb7235db0d0f8ff9 Author: Huaxin Gao <huax...@us.ibm.com> AuthorDate: Thu Aug 15 10:21:26 2019 -0500 [SPARK-28243][PYSPARK][ML][FOLLOW-UP] Move Python DecisionTreeParams to regression.py ## What changes were proposed in this pull request? Leave ```shared.py``` untouched. Move Python ```DecisionTreeParams``` to ```regression.py``` ## How was this patch tested? Use existing tests Closes #25406 from huaxingao/spark-28243. Authored-by: Huaxin Gao <huax...@us.ibm.com> Signed-off-by: Sean Owen <sean.o...@databricks.com> --- python/pyspark/ml/classification.py | 5 +- python/pyspark/ml/param/_shared_params_code_gen.py | 41 ------------ python/pyspark/ml/param/shared.py | 54 ---------------- python/pyspark/ml/regression.py | 74 ++++++++++++++++++++++ 4 files changed, 77 insertions(+), 97 deletions(-) diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index ce65439..5a1202b 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -22,8 +22,9 @@ from multiprocessing.pool import ThreadPool from pyspark import since, keyword_only from pyspark.ml import Estimator, Model from pyspark.ml.param.shared import * -from pyspark.ml.regression import DecisionTreeModel, DecisionTreeRegressionModel, \ - GBTParams, HasVarianceImpurity, RandomForestParams, TreeEnsembleModel +from pyspark.ml.regression import DecisionTreeModel, DecisionTreeParams, \ + DecisionTreeRegressionModel, GBTParams, HasVarianceImpurity, RandomForestParams, \ + TreeEnsembleModel from pyspark.ml.util import * from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaParams from pyspark.ml.wrapper import JavaWrapper diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py index 1b0c8c5..ca2e4a0 100644 --- a/python/pyspark/ml/param/_shared_params_code_gen.py +++ b/python/pyspark/ml/param/_shared_params_code_gen.py @@ -174,45 +174,4 @@ if __name__ == "__main__": param_code = _gen_param_header(name, doc, defaultValueStr, typeConverter) code.append(param_code + "\n" + _gen_param_code(name, doc, defaultValueStr)) - decisionTreeParams = [ - ("maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; " + - "depth 1 means 1 internal node + 2 leaf nodes.", "TypeConverters.toInt"), - ("maxBins", "Max number of bins for" + - " discretizing continuous features. Must be >=2 and >= number of categories for any" + - " categorical feature.", "TypeConverters.toInt"), - ("minInstancesPerNode", "Minimum number of instances each child must have after split. " + - "If a split causes the left or right child to have fewer than minInstancesPerNode, the " + - "split will be discarded as invalid. Should be >= 1.", "TypeConverters.toInt"), - ("minInfoGain", "Minimum information gain for a split to be considered at a tree node.", - "TypeConverters.toFloat"), - ("maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation. If too small," + - " then 1 node will be split per iteration, and its aggregates may exceed this size.", - "TypeConverters.toInt"), - ("cacheNodeIds", "If false, the algorithm will pass trees to executors to match " + - "instances with nodes. If true, the algorithm will cache node IDs for each instance. " + - "Caching can speed up training of deeper trees. Users can set how often should the " + - "cache be checkpointed or disable it by setting checkpointInterval.", - "TypeConverters.toBoolean")] - - decisionTreeCode = '''class DecisionTreeParams(Params): - """ - Mixin for Decision Tree parameters. - """ - - $dummyPlaceHolders - - def __init__(self): - super(DecisionTreeParams, self).__init__()''' - dtParamMethods = "" - dummyPlaceholders = "" - paramTemplate = """$name = Param($owner, "$name", "$doc", typeConverter=$typeConverterStr)""" - for name, doc, typeConverterStr in decisionTreeParams: - if typeConverterStr is None: - typeConverterStr = str(None) - variable = paramTemplate.replace("$name", name).replace("$doc", doc) \ - .replace("$typeConverterStr", typeConverterStr) - dummyPlaceholders += variable.replace("$owner", "Params._dummy()") + "\n " - dtParamMethods += _gen_param_code(name, doc, None) + "\n" - code.append(decisionTreeCode.replace("$dummyPlaceHolders", dummyPlaceholders) + "\n" + - dtParamMethods) print("\n\n\n".join(code)) diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py index 56d6190..9527ef6 100644 --- a/python/pyspark/ml/param/shared.py +++ b/python/pyspark/ml/param/shared.py @@ -747,57 +747,3 @@ class HasValidationIndicatorCol(Params): Gets the value of validationIndicatorCol or its default value. """ return self.getOrDefault(self.validationIndicatorCol) - - -class DecisionTreeParams(Params): - """ - Mixin for Decision Tree parameters. - """ - - maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.", typeConverter=TypeConverters.toInt) - maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous features. Must be >=2 and >= number of categories for any categorical feature.", typeConverter=TypeConverters.toInt) - minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of instances each child must have after split. If a split causes the left or right child to have fewer than minInstancesPerNode, the split will be discarded as invalid. Should be >= 1.", typeConverter=TypeConverters.toInt) - minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.", typeConverter=TypeConverters.toFloat) - maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.", typeConverter=TypeConverters.toInt) - cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.", typeConverter=TypeConverters.toBoolean) - - - def __init__(self): - super(DecisionTreeParams, self).__init__() - - def getMaxDepth(self): - """ - Gets the value of maxDepth or its default value. - """ - return self.getOrDefault(self.maxDepth) - - def getMaxBins(self): - """ - Gets the value of maxBins or its default value. - """ - return self.getOrDefault(self.maxBins) - - def getMinInstancesPerNode(self): - """ - Gets the value of minInstancesPerNode or its default value. - """ - return self.getOrDefault(self.minInstancesPerNode) - - def getMinInfoGain(self): - """ - Gets the value of minInfoGain or its default value. - """ - return self.getOrDefault(self.minInfoGain) - - def getMaxMemoryInMB(self): - """ - Gets the value of maxMemoryInMB or its default value. - """ - return self.getOrDefault(self.maxMemoryInMB) - - def getCacheNodeIds(self): - """ - Gets the value of cacheNodeIds or its default value. - """ - return self.getOrDefault(self.cacheNodeIds) - diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 349130f..aca9e6f 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -584,6 +584,80 @@ class IsotonicRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable): return self._call_java("predictions") +class DecisionTreeParams(Params): + """ + Mixin for Decision Tree parameters. + """ + + maxDepth = Param(Params._dummy(), "maxDepth", "Maximum depth of the tree. (>= 0) E.g., " + + "depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.", + typeConverter=TypeConverters.toInt) + + maxBins = Param(Params._dummy(), "maxBins", "Max number of bins for discretizing continuous " + + "features. Must be >=2 and >= number of categories for any categorical " + + "feature.", typeConverter=TypeConverters.toInt) + + minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", "Minimum number of " + + "instances each child must have after split. If a split causes " + + "the left or right child to have fewer than " + + "minInstancesPerNode, the split will be discarded as invalid. " + + "Should be >= 1.", typeConverter=TypeConverters.toInt) + + minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split " + + "to be considered at a tree node.", typeConverter=TypeConverters.toFloat) + + maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to " + + "histogram aggregation. If too small, then 1 node will be split per " + + "iteration, and its aggregates may exceed this size.", + typeConverter=TypeConverters.toInt) + + cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass " + + "trees to executors to match instances with nodes. If true, the " + + "algorithm will cache node IDs for each instance. Caching can speed " + + "up training of deeper trees. Users can set how often should the cache " + + "be checkpointed or disable it by setting checkpointInterval.", + typeConverter=TypeConverters.toBoolean) + + def __init__(self): + super(DecisionTreeParams, self).__init__() + + def getMaxDepth(self): + """ + Gets the value of maxDepth or its default value. + """ + return self.getOrDefault(self.maxDepth) + + def getMaxBins(self): + """ + Gets the value of maxBins or its default value. + """ + return self.getOrDefault(self.maxBins) + + def getMinInstancesPerNode(self): + """ + Gets the value of minInstancesPerNode or its default value. + """ + return self.getOrDefault(self.minInstancesPerNode) + + def getMinInfoGain(self): + """ + Gets the value of minInfoGain or its default value. + """ + return self.getOrDefault(self.minInfoGain) + + def getMaxMemoryInMB(self): + """ + Gets the value of maxMemoryInMB or its default value. + """ + return self.getOrDefault(self.maxMemoryInMB) + + def getCacheNodeIds(self): + """ + Gets the value of cacheNodeIds or its default value. + """ + return self.getOrDefault(self.cacheNodeIds) + + class TreeEnsembleParams(DecisionTreeParams): """ Mixin for Decision Tree-based ensemble algorithms parameters. --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org