spark git commit: [SPARK-19806][ML][PYSPARK] PySpark GeneralizedLinearRegression supports tweedie distribution.

yliang Wed, 08 Mar 2017 02:09:56 -0800

Repository: spark
Updated Branches:
  refs/heads/master 1fa58868b -> 81303f7ca



[SPARK-19806][ML][PYSPARK] PySpark GeneralizedLinearRegression supports tweedie 
distribution.

## What changes were proposed in this pull request?
PySpark `GeneralizedLinearRegression` now supports the Tweedie distribution, exposing the `variancePower` and `linkPower` params.

## How was this patch tested?
Added a unit test (`GeneralizedLinearRegressionTest.test_tweedie_distribution`) in `python/pyspark/ml/tests.py`.

Author: Yanbo Liang <yblia...@gmail.com>

Closes #17146 from yanboliang/spark-19806.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/81303f7c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/81303f7c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/81303f7c

Branch: refs/heads/master
Commit: 81303f7ca7808d51229411dce8feeed8c23dbe15
Parents: 1fa5886
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Wed Mar 8 02:09:36 2017 -0800
Committer: Yanbo Liang <yblia...@gmail.com>
Committed: Wed Mar 8 02:09:36 2017 -0800

----------------------------------------------------------------------
 .../GeneralizedLinearRegression.scala           |  8 +--
 python/pyspark/ml/regression.py                 | 61 +++++++++++++++++---
 python/pyspark/ml/tests.py                      | 20 +++++++
 3 files changed, 77 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/81303f7c/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
index 110764d..3be8b53 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/regression/GeneralizedLinearRegression.scala
@@ -66,7 +66,7 @@ private[regression] trait GeneralizedLinearRegressionBase 
extends PredictorParam
   /**
    * Param for the power in the variance function of the Tweedie distribution 
which provides
    * the relationship between the variance and mean of the distribution.
-   * Only applicable for the Tweedie family.
+   * Only applicable to the Tweedie family.
    * (see <a href="https://en.wikipedia.org/wiki/Tweedie_distribution">
    * Tweedie Distribution (Wikipedia)</a>)
    * Supported values: 0 and [1, Inf).
@@ -79,7 +79,7 @@ private[regression] trait GeneralizedLinearRegressionBase 
extends PredictorParam
   final val variancePower: DoubleParam = new DoubleParam(this, "variancePower",
     "The power in the variance function of the Tweedie distribution which 
characterizes " +
     "the relationship between the variance and mean of the distribution. " +
-    "Only applicable for the Tweedie family. Supported values: 0 and [1, 
Inf).",
+    "Only applicable to the Tweedie family. Supported values: 0 and [1, Inf).",
     (x: Double) => x >= 1.0 || x == 0.0)
 
   /** @group getParam */
@@ -106,7 +106,7 @@ private[regression] trait GeneralizedLinearRegressionBase 
extends PredictorParam
   def getLink: String = $(link)
 
   /**
-   * Param for the index in the power link function. Only applicable for the 
Tweedie family.
+   * Param for the index in the power link function. Only applicable to the 
Tweedie family.
    * Note that link power 0, 1, -1 or 0.5 corresponds to the Log, Identity, 
Inverse or Sqrt
    * link, respectively.
    * When not set, this value defaults to 1 - [[variancePower]], which matches 
the R "statmod"
@@ -116,7 +116,7 @@ private[regression] trait GeneralizedLinearRegressionBase 
extends PredictorParam
    */
   @Since("2.2.0")
   final val linkPower: DoubleParam = new DoubleParam(this, "linkPower",
-    "The index in the power link function. Only applicable for the Tweedie 
family.")
+    "The index in the power link function. Only applicable to the Tweedie 
family.")
 
   /** @group getParam */
   @Since("2.2.0")

http://git-wip-us.apache.org/repos/asf/spark/blob/81303f7c/python/pyspark/ml/regression.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py
index b199bf2..3c3fcc8 100644
--- a/python/pyspark/ml/regression.py
+++ b/python/pyspark/ml/regression.py
@@ -1294,8 +1294,8 @@ class GeneralizedLinearRegression(JavaEstimator, 
HasLabelCol, HasFeaturesCol, Ha
 
     Fit a Generalized Linear Model specified by giving a symbolic description 
of the linear
     predictor (link function) and a description of the error distribution 
(family). It supports
-    "gaussian", "binomial", "poisson" and "gamma" as family. Valid link 
functions for each family
-    is listed below. The first link function of each family is the default one.
+    "gaussian", "binomial", "poisson", "gamma" and "tweedie" as family. Valid 
link functions for
+    each family is listed below. The first link function of each family is the 
default one.
 
     * "gaussian" -> "identity", "log", "inverse"
 
@@ -1305,6 +1305,9 @@ class GeneralizedLinearRegression(JavaEstimator, 
HasLabelCol, HasFeaturesCol, Ha
 
     * "gamma"    -> "inverse", "identity", "log"
 
+    * "tweedie"  -> power link function specified through "linkPower". \
+                    The default link power in the tweedie family is 1 - 
variancePower.
+
     .. seealso:: `GLM 
<https://en.wikipedia.org/wiki/Generalized_linear_model>`_
 
     >>> from pyspark.ml.linalg import Vectors
@@ -1344,7 +1347,7 @@ class GeneralizedLinearRegression(JavaEstimator, 
HasLabelCol, HasFeaturesCol, Ha
 
     family = Param(Params._dummy(), "family", "The name of family which is a 
description of " +
                    "the error distribution to be used in the model. Supported 
options: " +
-                   "gaussian (default), binomial, poisson and gamma.",
+                   "gaussian (default), binomial, poisson, gamma and tweedie.",
                    typeConverter=TypeConverters.toString)
     link = Param(Params._dummy(), "link", "The name of link function which 
provides the " +
                  "relationship between the linear predictor and the mean of 
the distribution " +
@@ -1352,32 +1355,46 @@ class GeneralizedLinearRegression(JavaEstimator, 
HasLabelCol, HasFeaturesCol, Ha
                  "and sqrt.", typeConverter=TypeConverters.toString)
     linkPredictionCol = Param(Params._dummy(), "linkPredictionCol", "link 
prediction (linear " +
                               "predictor) column name", 
typeConverter=TypeConverters.toString)
+    variancePower = Param(Params._dummy(), "variancePower", "The power in the 
variance function " +
+                          "of the Tweedie distribution which characterizes the 
relationship " +
+                          "between the variance and mean of the distribution. 
Only applicable " +
+                          "for the Tweedie family. Supported values: 0 and [1, 
Inf).",
+                          typeConverter=TypeConverters.toFloat)
+    linkPower = Param(Params._dummy(), "linkPower", "The index in the power 
link function. " +
+                      "Only applicable to the Tweedie family.",
+                      typeConverter=TypeConverters.toFloat)
 
     @keyword_only
     def __init__(self, labelCol="label", featuresCol="features", 
predictionCol="prediction",
                  family="gaussian", link=None, fitIntercept=True, maxIter=25, 
tol=1e-6,
-                 regParam=0.0, weightCol=None, solver="irls", 
linkPredictionCol=None):
+                 regParam=0.0, weightCol=None, solver="irls", 
linkPredictionCol=None,
+                 variancePower=0.0, linkPower=None):
         """
         __init__(self, labelCol="label", featuresCol="features", 
predictionCol="prediction", \
                  family="gaussian", link=None, fitIntercept=True, maxIter=25, 
tol=1e-6, \
-                 regParam=0.0, weightCol=None, solver="irls", 
linkPredictionCol=None)
+                 regParam=0.0, weightCol=None, solver="irls", 
linkPredictionCol=None, \
+                 variancePower=0.0, linkPower=None)
         """
         super(GeneralizedLinearRegression, self).__init__()
         self._java_obj = self._new_java_obj(
             "org.apache.spark.ml.regression.GeneralizedLinearRegression", 
self.uid)
-        self._setDefault(family="gaussian", maxIter=25, tol=1e-6, 
regParam=0.0, solver="irls")
+        self._setDefault(family="gaussian", maxIter=25, tol=1e-6, 
regParam=0.0, solver="irls",
+                         variancePower=0.0)
         kwargs = self._input_kwargs
+
         self.setParams(**kwargs)
 
     @keyword_only
     @since("2.0.0")
     def setParams(self, labelCol="label", featuresCol="features", 
predictionCol="prediction",
                   family="gaussian", link=None, fitIntercept=True, maxIter=25, 
tol=1e-6,
-                  regParam=0.0, weightCol=None, solver="irls", 
linkPredictionCol=None):
+                  regParam=0.0, weightCol=None, solver="irls", 
linkPredictionCol=None,
+                  variancePower=0.0, linkPower=None):
         """
         setParams(self, labelCol="label", featuresCol="features", 
predictionCol="prediction", \
                   family="gaussian", link=None, fitIntercept=True, maxIter=25, 
tol=1e-6, \
-                  regParam=0.0, weightCol=None, solver="irls", 
linkPredictionCol=None)
+                  regParam=0.0, weightCol=None, solver="irls", 
linkPredictionCol=None, \
+                  variancePower=0.0, linkPower=None)
         Sets params for generalized linear regression.
         """
         kwargs = self._input_kwargs
@@ -1428,6 +1445,34 @@ class GeneralizedLinearRegression(JavaEstimator, 
HasLabelCol, HasFeaturesCol, Ha
         """
         return self.getOrDefault(self.link)
 
+    @since("2.2.0")
+    def setVariancePower(self, value):
+        """
+        Sets the value of :py:attr:`variancePower`.
+        """
+        return self._set(variancePower=value)
+
+    @since("2.2.0")
+    def getVariancePower(self):
+        """
+        Gets the value of variancePower or its default value.
+        """
+        return self.getOrDefault(self.variancePower)
+
+    @since("2.2.0")
+    def setLinkPower(self, value):
+        """
+        Sets the value of :py:attr:`linkPower`.
+        """
+        return self._set(linkPower=value)
+
+    @since("2.2.0")
+    def getLinkPower(self):
+        """
+        Gets the value of linkPower or its default value.
+        """
+        return self.getOrDefault(self.linkPower)
+
 
 class GeneralizedLinearRegressionModel(JavaModel, JavaPredictionModel, 
JavaMLWritable,
                                        JavaMLReadable):

http://git-wip-us.apache.org/repos/asf/spark/blob/81303f7c/python/pyspark/ml/tests.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index 3524160..f052f5b 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -1223,6 +1223,26 @@ class HashingTFTest(SparkSessionTestCase):
                                    ": expected " + str(expected[i]) + ", got " 
+ str(features[i]))
 
 
+class GeneralizedLinearRegressionTest(SparkSessionTestCase):
+
+    def test_tweedie_distribution(self):
+
+        df = self.spark.createDataFrame(
+            [(1.0, Vectors.dense(0.0, 0.0)),
+             (1.0, Vectors.dense(1.0, 2.0)),
+             (2.0, Vectors.dense(0.0, 0.0)),
+             (2.0, Vectors.dense(1.0, 1.0)), ], ["label", "features"])
+
+        glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6)
+        model = glr.fit(df)
+        self.assertTrue(np.allclose(model.coefficients.toArray(), [-0.4645, 
0.3402], atol=1E-4))
+        self.assertTrue(np.isclose(model.intercept, 0.7841, atol=1E-4))
+
+        model2 = glr.setLinkPower(-1.0).fit(df)
+        self.assertTrue(np.allclose(model2.coefficients.toArray(), [-0.6667, 
0.5], atol=1E-4))
+        self.assertTrue(np.isclose(model2.intercept, 0.6667, atol=1E-4))
+
+
 class ALSTest(SparkSessionTestCase):
 
     def test_storage_levels(self):


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-19806][ML][PYSPARK] PySpark GeneralizedLinearRegression supports tweedie distribution.

Reply via email to