[GitHub] spark pull request: [MLLIB] SPARK-1547: Adding Gradient Boosting t...

jkbradley Tue, 28 Oct 2014 12:43:38 -0700

Github user jkbradley commented on a diff in the pull request:

    https://github.com/apache/spark/pull/2607#discussion_r19497888
  
    --- Diff: 
mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoosting.scala ---
    @@ -0,0 +1,433 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one or more
    + * contributor license agreements.  See the NOTICE file distributed with
    + * this work for additional information regarding copyright ownership.
    + * The ASF licenses this file to You under the Apache License, Version 2.0
    + * (the "License"); you may not use this file except in compliance with
    + * the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.spark.mllib.tree
    +
    +import scala.collection.JavaConverters._
    +
    +import org.apache.spark.annotation.Experimental
    +import org.apache.spark.api.java.JavaRDD
    +import org.apache.spark.mllib.tree.configuration.BoostingStrategy
    +import org.apache.spark.Logging
    +import org.apache.spark.mllib.tree.impl.TimeTracker
    +import org.apache.spark.mllib.tree.loss.Losses
    +import org.apache.spark.rdd.RDD
    +import org.apache.spark.mllib.regression.LabeledPoint
    +import org.apache.spark.mllib.tree.model.{WeightedEnsembleModel, 
DecisionTreeModel}
    +import org.apache.spark.mllib.tree.configuration.Algo._
    +import org.apache.spark.storage.StorageLevel
    +import 
org.apache.spark.mllib.tree.configuration.EnsembleCombiningStrategy.Sum
    +
    +/**
    + * :: Experimental ::
    + * A class that implements gradient boosting for regression problems.
    + * @param boostingStrategy Parameters for the gradient boosting algorithm
    + */
    +@Experimental
    +class GradientBoosting (
    +    private val boostingStrategy: BoostingStrategy) extends Serializable 
with Logging {
    +
    +  /**
    +   * Method to train a gradient boosting model
    +   * @param input Training dataset: RDD of 
[[org.apache.spark.mllib.regression.LabeledPoint]].
    +   * @return GradientBoostingModel that can be used for prediction
    +   */
    +  def train(input: RDD[LabeledPoint]): WeightedEnsembleModel = {
    +    val algo = boostingStrategy.algo
    +    algo match {
    +      case Regression => GradientBoosting.boost(input, boostingStrategy)
    +      case Classification =>
    +        val remappedInput = input.map(x => new LabeledPoint((x.label * 2) 
- 1, x.features))
    +        GradientBoosting.boost(remappedInput, boostingStrategy)
    +      case _ =>
    +        throw new IllegalArgumentException(s"$algo is not supported by the 
gradient boosting.")
    +    }
    +  }
    +
    +}
    +
    +
    +object GradientBoosting extends Logging {
    +
    +  /**
    +   * Method to train a gradient boosting model.
    +   *
    +   * Note: Using 
[[org.apache.spark.mllib.tree.GradientBoosting#trainRegressor]]
    +   *       is recommended to clearly specify regression.
    +   *       Using 
[[org.apache.spark.mllib.tree.GradientBoosting#trainClassifier]]
    +   *       is recommended to clearly specify regression.
    +   *
    +   * @param input Training dataset: RDD of 
[[org.apache.spark.mllib.regression.LabeledPoint]].
    +   *              For classification, labels should take values {0, 1, 
..., numClasses-1}.
    +   *              For regression, labels are real numbers.
    +   * @param boostingStrategy Configuration options for the boosting 
algorithm.
    +   * @return GradientBoostingModel that can be used for prediction
    +   */
    +  def train(
    +      input: RDD[LabeledPoint],
    +      boostingStrategy: BoostingStrategy): WeightedEnsembleModel = {
    +    new GradientBoosting(boostingStrategy).train(input)
    +  }
    +
    +  /**
    +   * Method to train a gradient boosting regression model.
    +   *
    +   * @param input Training dataset: RDD of 
[[org.apache.spark.mllib.regression.LabeledPoint]].
    +   *              For classification, labels should take values {0, 1, 
..., numClasses-1}.
    +   *              For regression, labels are real numbers.
    +   * @param numEstimators Number of estimators used in boosting stages. In 
other words,
    +   *                      number of boosting iterations performed.
    +   * @param loss Loss function used for minimization during gradient 
boosting.
    +   * @param maxDepth Maximum depth of the tree.
    +   *                 E.g., depth 0 means 1 leaf node; depth 1 means 1 
internal node + 2 leaf nodes.
    +   * @param learningRate Learning rate for shrinking the contribution of 
each estimator. The
    +   *                     learning rate should be between in the interval 
(0, 1]
    +   * @param subsample  Fraction of the training data used for learning the 
decision tree.
    +   * @param checkpointPeriod Checkpointing the dataset in memory to avoid 
long lineage chains.
    +   * @param categoricalFeaturesInfo A map storing information about the 
categorical variables and
    +   *                                the number of discrete values they 
take. For example,
    +   *                                an entry (n -> k) implies the feature 
n is categorical with k
    +   *                                categories 0, 1, 2, ... , k-1. It's 
important to note that
    +   *                                features are zero-indexed.
    +   * @return GradientBoostingModel that can be used for prediction
    +   */
    +  def trainRegressor(
    +      input: RDD[LabeledPoint],
    +      numEstimators: Int,
    +      loss: String,
    +      maxDepth: Int,
    +      learningRate: Double,
    +      subsample: Double,
    +      checkpointPeriod: Int,
    +      categoricalFeaturesInfo: Map[Int, Int]): WeightedEnsembleModel = {
    +    val lossType = Losses.fromString(loss)
    +    val boostingStrategy = new BoostingStrategy(Regression, numEstimators, 
lossType,
    +      maxDepth, learningRate, subsample, checkpointPeriod, 2,
    +      categoricalFeaturesInfo = categoricalFeaturesInfo)
    +    new GradientBoosting(boostingStrategy).train(input)
    +  }
    +
    +
    +  /**
    +   * Method to train a gradient boosting binary classification model.
    +   *
    +   * @param input Training dataset: RDD of 
[[org.apache.spark.mllib.regression.LabeledPoint]].
    +   *              For classification, labels should take values {0, 1, 
..., numClasses-1}.
    +   *              For regression, labels are real numbers.
    +   * @param numEstimators Number of estimators used in boosting stages. In 
other words,
    +   *                      number of boosting iterations performed.
    +   * @param loss Loss function used for minimization during gradient 
boosting.
    +   * @param maxDepth Maximum depth of the tree.
    +   *                 E.g., depth 0 means 1 leaf node; depth 1 means 1 
internal node + 2 leaf nodes.
    +   * @param learningRate Learning rate for shrinking the contribution of 
each estimator. The
    +   *                     learning rate should be between in the interval 
(0, 1]
    +   * @param subsample  Fraction of the training data used for learning the 
decision tree.
    +   * @param checkpointPeriod Checkpointing the dataset in memory to avoid 
long lineage chains.
    +   * @param numClassesForClassification Number of classes for 
classification.
    +   *                                    (Ignored for regression.)
    +   *                                    Default value is 2 (binary 
classification).
    +   * @param categoricalFeaturesInfo A map storing information about the 
categorical variables and
    +   *                                the number of discrete values they 
take. For example,
    +   *                                an entry (n -> k) implies the feature 
n is categorical with k
    +   *                                categories 0, 1, 2, ... , k-1. It's 
important to note that
    +   *                                features are zero-indexed.
    +   * @return GradientBoostingModel that can be used for prediction
    +   */
    +  def trainClassifier(
    +      input: RDD[LabeledPoint],
    +      numEstimators: Int,
    +      loss: String,
    +      maxDepth: Int,
    +      learningRate: Double,
    +      subsample: Double,
    +      checkpointPeriod: Int,
    +      numClassesForClassification: Int,
    +      categoricalFeaturesInfo: Map[Int, Int]): WeightedEnsembleModel = {
    +    val lossType = Losses.fromString(loss)
    +    val boostingStrategy = new BoostingStrategy(Regression, numEstimators, 
lossType,
    +      maxDepth, learningRate, subsample, checkpointPeriod, 
numClassesForClassification,
    +      categoricalFeaturesInfo = categoricalFeaturesInfo)
    +    new GradientBoosting(boostingStrategy).train(input)
    +  }
    +
    +  /**
    +   * Java-friendly API for 
[[org.apache.spark.mllib.tree.GradientBoosting#trainRegressor]]
    +   *
    +   * @param input Training dataset: RDD of 
[[org.apache.spark.mllib.regression.LabeledPoint]].
    +   *              For classification, labels should take values {0, 1, 
..., numClasses-1}.
    +   *              For regression, labels are real numbers.
    +   * @param numEstimators Number of estimators used in boosting stages. In 
other words,
    +   *                      number of boosting iterations performed.
    +   * @param loss Loss function used for minimization during gradient 
boosting.
    +   * @param maxDepth Maximum depth of the tree.
    +   *                 E.g., depth 0 means 1 leaf node; depth 1 means 1 
internal node + 2 leaf nodes.
    +   * @param learningRate Learning rate for shrinking the contribution of 
each estimator. The
    +   *                     learning rate should be between in the interval 
(0, 1]
    +   * @param subsample  Fraction of the training data used for learning the 
decision tree.
    +   * @param checkpointPeriod Checkpointing the dataset in memory to avoid 
long lineage chains.
    +   * @param categoricalFeaturesInfo A map storing information about the 
categorical variables and
    +   *                                the number of discrete values they 
take. For example,
    +   *                                an entry (n -> k) implies the feature 
n is categorical with k
    +   *                                categories 0, 1, 2, ... , k-1. It's 
important to note that
    +   *                                features are zero-indexed.
    +   * @return GradientBoostingModel that can be used for prediction
    +   */
    +  def trainRegressor(
    +      input: JavaRDD[LabeledPoint],
    +      numEstimators: Int,
    +      loss: String,
    +      maxDepth: Int,
    +      learningRate: Double,
    +      subsample: Double,
    +      checkpointPeriod: Int,
    +      categoricalFeaturesInfo: java.util.Map[java.lang.Integer, 
java.lang.Integer])
    +      : WeightedEnsembleModel = {
    +    val lossType = Losses.fromString(loss)
    +    val boostingStrategy = new BoostingStrategy(Regression, numEstimators, 
lossType,
    +      maxDepth, learningRate, subsample, checkpointPeriod, 2, 
categoricalFeaturesInfo =
    +        categoricalFeaturesInfo.asInstanceOf[java.util.Map[Int, 
Int]].asScala.toMap)
    +    new GradientBoosting(boostingStrategy).train(input)
    +  }
    +
    +  /**
    +   * Java-friendly API for 
[[org.apache.spark.mllib.tree.GradientBoosting#trainClassifier]]
    +   *
    +   * @param input Training dataset: RDD of 
[[org.apache.spark.mllib.regression.LabeledPoint]].
    +   *              For classification, labels should take values {0, 1, 
..., numClasses-1}.
    +   *              For regression, labels are real numbers.
    +   * @param numEstimators Number of estimators used in boosting stages. In 
other words,
    +   *                      number of boosting iterations performed.
    +   * @param loss Loss function used for minimization during gradient 
boosting.
    +   * @param maxDepth Maximum depth of the tree.
    +   *                 E.g., depth 0 means 1 leaf node; depth 1 means 1 
internal node + 2 leaf nodes.
    +   * @param learningRate Learning rate for shrinking the contribution of 
each estimator. The
    +   *                     learning rate should be between in the interval 
(0, 1]
    +   * @param subsample  Fraction of the training data used for learning the 
decision tree.
    +   * @param checkpointPeriod Checkpointing the dataset in memory to avoid 
long lineage chains.
    +   * @param numClassesForClassification Number of classes for 
classification.
    +   *                                    (Ignored for regression.)
    +   *                                    Default value is 2 (binary 
classification).
    +   * @param categoricalFeaturesInfo A map storing information about the 
categorical variables and
    +   *                                the number of discrete values they 
take. For example,
    +   *                                an entry (n -> k) implies the feature 
n is categorical with k
    +   *                                categories 0, 1, 2, ... , k-1. It's 
important to note that
    +   *                                features are zero-indexed.
    +   * @return GradientBoostingModel that can be used for prediction
    +   */
    +  def trainClassifier(
    +      input: JavaRDD[LabeledPoint],
    +      numEstimators: Int,
    +      loss: String,
    +      maxDepth: Int,
    +      learningRate: Double,
    +      subsample: Double,
    +      checkpointPeriod: Int,
    +      numClassesForClassification: Int,
    +      categoricalFeaturesInfo: java.util.Map[java.lang.Integer, 
java.lang.Integer])
    +      : WeightedEnsembleModel = {
    +    val lossType = Losses.fromString(loss)
    +    val boostingStrategy = new BoostingStrategy(Regression, numEstimators, 
lossType,
    +      maxDepth, learningRate, subsample, checkpointPeriod,
    +      numClassesForClassification, categoricalFeaturesInfo =
    +      categoricalFeaturesInfo.asInstanceOf[java.util.Map[Int, 
Int]].asScala.toMap)
    +    new GradientBoosting(boostingStrategy).train(input)
    +  }
    +
    +  /**
    +   * Method to train a gradient boosting regression model.
    +   *
    +   * @param input Training dataset: RDD of 
[[org.apache.spark.mllib.regression.LabeledPoint]].
    +   *              For classification, labels should take values {0, 1, 
..., numClasses-1}.
    +   *              For regression, labels are real numbers.
    +   * @param numEstimators Number of estimators used in boosting stages. In 
other words,
    +   *                      number of boosting iterations performed.
    +   * @param loss Loss function used for minimization during gradient 
boosting.
    +   * @param maxDepth Maximum depth of the tree.
    +   *                 E.g., depth 0 means 1 leaf node; depth 1 means 1 
internal node + 2 leaf nodes.
    +   * @param learningRate Learning rate for shrinking the contribution of 
each estimator. The
    +   *                     learning rate should be between in the interval 
(0, 1]
    +   * @param subsample  Fraction of the training data used for learning the 
decision tree.
    +   * @return GradientBoostingModel that can be used for prediction
    +   */
    +  def trainRegressor(
    +      input: RDD[LabeledPoint],
    +      numEstimators: Int,
    +      loss: String,
    +      maxDepth: Int,
    +      learningRate: Double,
    +      subsample: Double): WeightedEnsembleModel = {
    +    val lossType = Losses.fromString(loss)
    +    val boostingStrategy = new BoostingStrategy(Regression, numEstimators, 
lossType,
    +      maxDepth, learningRate, subsample)
    +    new GradientBoosting(boostingStrategy).train(input)
    +  }
    +
    +  /**
    +   * Method to train a gradient boosting binary classification model.
    +   *
    +   * @param input Training dataset: RDD of 
[[org.apache.spark.mllib.regression.LabeledPoint]].
    +   *              For classification, labels should take values {0, 1, 
..., numClasses-1}.
    +   *              For regression, labels are real numbers.
    +   * @param numEstimators Number of estimators used in boosting stages. In 
other words,
    +   *                      number of boosting iterations performed.
    +   * @param loss Loss function used for minimization during gradient 
boosting.
    +   * @param maxDepth Maximum depth of the tree.
    +   *                 E.g., depth 0 means 1 leaf node; depth 1 means 1 
internal node + 2 leaf nodes.
    +   * @param learningRate Learning rate for shrinking the contribution of 
each estimator. The
    +   *                     learning rate should be between in the interval 
(0, 1]
    +   * @param subsample  Fraction of the training data used for learning the 
decision tree.
    +   * @return GradientBoostingModel that can be used for prediction
    +   */
    +  def trainClassifier(
    +      input: RDD[LabeledPoint],
    +      numEstimators: Int,
    +      loss: String,
    +      maxDepth: Int,
    +      learningRate: Double,
    +      subsample: Double): WeightedEnsembleModel = {
    +    val lossType = Losses.fromString(loss)
    +    val boostingStrategy = new BoostingStrategy(Regression, numEstimators, 
lossType,
    +      maxDepth, learningRate, subsample)
    +    new GradientBoosting(boostingStrategy).train(input)
    +  }
    +
    +  /**
    +   * Method to train a gradient boosting regression model.
    +   *
    +   * @param input Training dataset: RDD of 
[[org.apache.spark.mllib.regression.LabeledPoint]].
    +   *              For classification, labels should take values {0, 1, 
..., numClasses-1}.
    +   *              For regression, labels are real numbers.
    +   * @param boostingStrategy Configuration options for the boosting 
algorithm.
    +   * @return GradientBoostingModel that can be used for prediction
    +   */
    +  def trainRegressor(
    +      input: RDD[LabeledPoint],
    +      boostingStrategy: BoostingStrategy): WeightedEnsembleModel = {
    +    val algo = boostingStrategy.algo
    +    require(algo == Regression, s"Only Regression algo supported. Provided 
algo is $algo.")
    +    new GradientBoosting(boostingStrategy).train(input)
    +  }
    +
    +  /**
    +   * Method to train a gradient boosting classification model.
    +   *
    +   * @param input Training dataset: RDD of 
[[org.apache.spark.mllib.regression.LabeledPoint]].
    +   *              For classification, labels should take values {0, 1, 
..., numClasses-1}.
    +   *              For regression, labels are real numbers.
    +   * @param boostingStrategy Configuration options for the boosting 
algorithm.
    +   * @return GradientBoostingModel that can be used for prediction
    +   */
    +  def trainClassifier(
    +      input: RDD[LabeledPoint],
    +      boostingStrategy: BoostingStrategy): WeightedEnsembleModel = {
    +    val algo = boostingStrategy.algo
    +    require(algo == Classification, s"Only Classification algo supported. 
Provided algo is $algo.")
    +    new GradientBoosting(boostingStrategy).train(input)
    +  }
    +
    +  /**
    +   * Internal method for performing regression using trees as base 
learners.
    +   * @param input training dataset
    +   * @param boostingStrategy boosting parameters
    +   * @return
    +   */
    +  private def boost(
    +      input: RDD[LabeledPoint],
    +      boostingStrategy: BoostingStrategy): WeightedEnsembleModel = {
    +
    +    val timer = new TimeTracker()
    +
    +    timer.start("total")
    +    timer.start("init")
    +
    +
    +    // Initialize gradient boosting parameters
    +    val numEstimators = boostingStrategy.numEstimators
    +    val baseLearners = new Array[DecisionTreeModel](numEstimators)
    +    val baseLearnerWeights = new Array[Double](numEstimators)
    +    val loss = boostingStrategy.loss
    +    val learningRate = boostingStrategy.learningRate
    +    val checkpointingPeriod = boostingStrategy.checkpointPeriod
    --- End diff --
    
    Use same name: "checkpointingPeriod" or "checkpointPeriod"



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] spark pull request: [MLLIB] SPARK-1547: Adding Gradient Boosting t...

Reply via email to