Repository: spark Updated Branches: refs/heads/master 14e75758a -> 845c039ce
[SPARK-20601][ML] Python API for Constrained Logistic Regression ## What changes were proposed in this pull request? Python API for Constrained Logistic Regression based on #17922 , thanks for the original contribution from zero323 . ## How was this patch tested? Unit tests. Author: zero323 <zero...@users.noreply.github.com> Author: Yanbo Liang <yblia...@gmail.com> Closes #18759 from yanboliang/SPARK-20601. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/845c039c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/845c039c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/845c039c Branch: refs/heads/master Commit: 845c039ceb1662632a97631b110e875e934894ad Parents: 14e7575 Author: zero323 <zero...@users.noreply.github.com> Authored: Wed Aug 2 18:10:26 2017 +0800 Committer: Yanbo Liang <yblia...@gmail.com> Committed: Wed Aug 2 18:10:26 2017 +0800 ---------------------------------------------------------------------- python/pyspark/ml/classification.py | 105 +++++++++++++++++++++++++++++-- python/pyspark/ml/param/__init__.py | 11 +++- python/pyspark/ml/tests.py | 37 +++++++++++ 3 files changed, 148 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/845c039c/python/pyspark/ml/classification.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index ab1617b..bccf8e7 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -252,18 +252,55 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti "be used in the model. 
Supported options: auto, binomial, multinomial", typeConverter=TypeConverters.toString) + lowerBoundsOnCoefficients = Param(Params._dummy(), "lowerBoundsOnCoefficients", + "The lower bounds on coefficients if fitting under bound " + "constrained optimization. The bound matrix must be " + "compatible with the shape " + "(1, number of features) for binomial regression, or " + "(number of classes, number of features) " + "for multinomial regression.", + typeConverter=TypeConverters.toMatrix) + + upperBoundsOnCoefficients = Param(Params._dummy(), "upperBoundsOnCoefficients", + "The upper bounds on coefficients if fitting under bound " + "constrained optimization. The bound matrix must be " + "compatible with the shape " + "(1, number of features) for binomial regression, or " + "(number of classes, number of features) " + "for multinomial regression.", + typeConverter=TypeConverters.toMatrix) + + lowerBoundsOnIntercepts = Param(Params._dummy(), "lowerBoundsOnIntercepts", + "The lower bounds on intercepts if fitting under bound " + "constrained optimization. The bounds vector size must be " + "equal with 1 for binomial regression, or the number of " + "classes for multinomial regression.", + typeConverter=TypeConverters.toVector) + + upperBoundsOnIntercepts = Param(Params._dummy(), "upperBoundsOnIntercepts", + "The upper bounds on intercepts if fitting under bound " + "constrained optimization. 
The bound vector size must be " + "equal with 1 for binomial regression, or the number of " + "classes for multinomial regression.", + typeConverter=TypeConverters.toVector) + @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol="probability", rawPredictionCol="rawPrediction", standardization=True, weightCol=None, - aggregationDepth=2, family="auto"): + aggregationDepth=2, family="auto", + lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, + lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None): + """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ threshold=0.5, thresholds=None, probabilityCol="probability", \ rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \ - aggregationDepth=2, family="auto") + aggregationDepth=2, family="auto", \ + lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \ + lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None): If the threshold and thresholds Params are both set, they must be equivalent. 
""" super(LogisticRegression, self).__init__() @@ -280,13 +317,17 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol="probability", rawPredictionCol="rawPrediction", standardization=True, weightCol=None, - aggregationDepth=2, family="auto"): + aggregationDepth=2, family="auto", + lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, + lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ threshold=0.5, thresholds=None, probabilityCol="probability", \ rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \ - aggregationDepth=2, family="auto") + aggregationDepth=2, family="auto", \ + lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \ + lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None): Sets params for logistic regression. If the threshold and thresholds Params are both set, they must be equivalent. 
""" @@ -381,6 +422,62 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti """ return self.getOrDefault(self.family) + @since("2.3.0") + def setLowerBoundsOnCoefficients(self, value): + """ + Sets the value of :py:attr:`lowerBoundsOnCoefficients` + """ + return self._set(lowerBoundsOnCoefficients=value) + + @since("2.3.0") + def getLowerBoundsOnCoefficients(self): + """ + Gets the value of :py:attr:`lowerBoundsOnCoefficients` + """ + return self.getOrDefault(self.lowerBoundsOnCoefficients) + + @since("2.3.0") + def setUpperBoundsOnCoefficients(self, value): + """ + Sets the value of :py:attr:`upperBoundsOnCoefficients` + """ + return self._set(upperBoundsOnCoefficients=value) + + @since("2.3.0") + def getUpperBoundsOnCoefficients(self): + """ + Gets the value of :py:attr:`upperBoundsOnCoefficients` + """ + return self.getOrDefault(self.upperBoundsOnCoefficients) + + @since("2.3.0") + def setLowerBoundsOnIntercepts(self, value): + """ + Sets the value of :py:attr:`lowerBoundsOnIntercepts` + """ + return self._set(lowerBoundsOnIntercepts=value) + + @since("2.3.0") + def getLowerBoundsOnIntercepts(self): + """ + Gets the value of :py:attr:`lowerBoundsOnIntercepts` + """ + return self.getOrDefault(self.lowerBoundsOnIntercepts) + + @since("2.3.0") + def setUpperBoundsOnIntercepts(self, value): + """ + Sets the value of :py:attr:`upperBoundsOnIntercepts` + """ + return self._set(upperBoundsOnIntercepts=value) + + @since("2.3.0") + def getUpperBoundsOnIntercepts(self): + """ + Gets the value of :py:attr:`upperBoundsOnIntercepts` + """ + return self.getOrDefault(self.upperBoundsOnIntercepts) + class LogisticRegressionModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable): """ http://git-wip-us.apache.org/repos/asf/spark/blob/845c039c/python/pyspark/ml/param/__init__.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py 
index 99d8fa3..4583ae8 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -27,7 +27,7 @@ import numpy as np from py4j.java_gateway import JavaObject -from pyspark.ml.linalg import DenseVector, Vector +from pyspark.ml.linalg import DenseVector, Vector, Matrix from pyspark.ml.util import Identifiable @@ -170,6 +170,15 @@ class TypeConverters(object): raise TypeError("Could not convert %s to vector" % value) @staticmethod + def toMatrix(value): + """ + Convert a value to a MLlib Matrix, if possible. + """ + if isinstance(value, Matrix): + return value + raise TypeError("Could not convert %s to matrix" % value) + + @staticmethod def toFloat(value): """ Convert a value to a float, if possible. http://git-wip-us.apache.org/repos/asf/spark/blob/845c039c/python/pyspark/ml/tests.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index a9ca346..7ee2c2f 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -1459,6 +1459,43 @@ class GeneralizedLinearRegressionTest(SparkSessionTestCase): self.assertTrue(np.isclose(model.intercept, -1.561613, atol=1E-4)) +class LogisticRegressionTest(SparkSessionTestCase): + + def test_binomial_logistic_regression_with_bound(self): + + df = self.spark.createDataFrame( + [(1.0, 1.0, Vectors.dense(0.0, 5.0)), + (0.0, 2.0, Vectors.dense(1.0, 2.0)), + (1.0, 3.0, Vectors.dense(2.0, 1.0)), + (0.0, 4.0, Vectors.dense(3.0, 3.0)), ], ["label", "weight", "features"]) + + lor = LogisticRegression(regParam=0.01, weightCol="weight", + lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]), + upperBoundsOnIntercepts=Vectors.dense(0.0)) + model = lor.fit(df) + self.assertTrue( + np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4)) + self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4)) + + def test_multinomial_logistic_regression_with_bound(self): + + data_path = 
"data/mllib/sample_multiclass_classification_data.txt" + df = self.spark.read.format("libsvm").load(data_path) + + lor = LogisticRegression(regParam=0.01, + lowerBoundsOnCoefficients=Matrices.dense(3, 4, range(12)), + upperBoundsOnIntercepts=Vectors.dense(0.0, 0.0, 0.0)) + model = lor.fit(df) + expected = [[4.593, 4.5516, 9.0099, 12.2904], + [1.0, 8.1093, 7.0, 10.0], + [3.041, 5.0, 8.0, 11.0]] + for i in range(0, len(expected)): + self.assertTrue( + np.allclose(model.coefficientMatrix.toArray()[i], expected[i], atol=1E-4)) + self.assertTrue( + np.allclose(model.interceptVector.toArray(), [-0.9057, -1.1392, -0.0033], atol=1E-4)) + + class FPGrowthTests(SparkSessionTestCase): def setUp(self): super(FPGrowthTests, self).setUp() --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org