GitHub user sethah commented on a diff in the pull request:

    https://github.com/apache/spark/pull/15721#discussion_r93420640
  
    --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala ---
    @@ -224,4 +208,139 @@ object MLTestingUtils extends SparkFunSuite {
         }.toDF()
         (overSampledData, weightedData)
       }
    +
    +  /**
    +   * Generates a linear prediction function where the coefficients are generated randomly.
    +   * The function produces a continuous (numClasses = 0) or categorical (numClasses > 0) label.
    +   */
    +  def getRandomLinearPredictionFunction(
    +      numFeatures: Int,
    +      numClasses: Int,
    +      seed: Long): (Vector => Double) = {
    +    val rng = new scala.util.Random(seed)
    +    val trueNumClasses = if (numClasses == 0) 1 else numClasses
    +    val coefArray = Array.fill(numFeatures * trueNumClasses)(rng.nextDouble - 0.5)
    +    (features: Vector) => {
    +      if (numClasses == 0) {
    +        BLAS.dot(features, new DenseVector(coefArray))
    +      } else {
    +        val margins = new DenseVector(new Array[Double](numClasses))
    +        val coefMat = new DenseMatrix(numClasses, numFeatures, coefArray)
    +        BLAS.gemv(1.0, coefMat, features, 1.0, margins)
    +        margins.argmax.toDouble
    +      }
    +    }
    +  }
    +
    +  /**
    +   * A helper function to generate synthetic data. Generates random feature values,
    +   * both categorical and continuous, according to `categoricalFeaturesInfo`. The label
    +   * is generated from a random prediction function, and noise is added to the true label.
    +   *
    +   * @param numPoints The number of data points to generate.
    +   * @param numClasses The number of classes the outcome can take on. 0 for continuous labels.
    +   * @param numFeatures The number of features in the data.
    +   * @param categoricalFeaturesInfo Map of (featureIndex -> numCategories) for categorical features.
    +   * @param seed Random seed.
    +   * @param noiseLevel A number in [0.0, 1.0] indicating how much noise to add to the label.
    +   * @return Generated sequence of noisy instances.
    +   */
    +  def generateNoisyData(
    +      numPoints: Int,
    +      numClasses: Int,
    +      numFeatures: Int,
    +      categoricalFeaturesInfo: Map[Int, Int],
    +      seed: Long,
    +      noiseLevel: Double = 0.3): Seq[Instance] = {
    +    require(noiseLevel >= 0.0 && noiseLevel <= 1.0, "noiseLevel must be in range [0.0, 1.0]")
    +    val rng = new scala.util.Random(seed)
    +    val predictionFunc = getRandomLinearPredictionFunction(numFeatures, numClasses, seed)
    +    Range(0, numPoints).map { i =>
    +      val features = Vectors.dense(Array.tabulate(numFeatures) { j =>
    +        val numCategories = categoricalFeaturesInfo.getOrElse(j, 0)
    +        if (numCategories > 0) {
    +          rng.nextInt(numCategories)
    +        } else {
    +          rng.nextDouble() - 0.5
    +        }
    +      })
    +      val label = predictionFunc(features)
    +      val noisyLabel = if (numClasses > 0) {
    +        // with probability equal to noiseLevel, select a random class instead of the true class
    +        if (rng.nextDouble < noiseLevel) rng.nextInt(numClasses) else label
    +      } else {
    +        // add noise to the label proportional to the noise level
    +        label + noiseLevel * rng.nextGaussian()
    +      }
    +      Instance(noisyLabel, 1.0, features)
    +    }
    +  }
    +
    +  /**
    +   * Helper function for testing sample weights. Tests that oversampling each point is
    +   * equivalent to assigning a sample weight proportional to the number of samples for
    +   * each point.
    +   */
    +  def testOversamplingVsWeighting[M <: Model[M], E <: Estimator[M]](
    +        spark: SparkSession,
    +        estimator: E with HasWeightCol with HasLabelCol with HasFeaturesCol,
    +        categoricalFeaturesInfo: Map[Int, Int],
    +        numPoints: Int,
    +        numClasses: Int,
    +        numFeatures: Int,
    +        modelEquals: (M, M) => Unit,
    +        seed: Long): Unit = {
    +    import spark.implicits._
    +    val df = generateNoisyData(numPoints, numClasses, numFeatures, categoricalFeaturesInfo,
    +      seed).toDF()
    +    val (overSampledData, weightedData) = genEquivalentOversampledAndWeightedInstances(
    +      df, estimator.getLabelCol, estimator.getFeaturesCol, seed)
    +    val weightedModel = estimator.set(estimator.weightCol, "weight").fit(weightedData)
    +    val overSampledModel = estimator.set(estimator.weightCol, "").fit(overSampledData)
    +    modelEquals(weightedModel, overSampledModel)
    +  }
    +
    +  /**
    +   * Helper function for testing sample weights. Tests that injecting a large number
    +   * of outliers with very small sample weights does not affect fitting. The predictor
    +   * should learn the true model despite the outliers.
    +   */
    +  def testOutliersWithSmallWeights[M <: Model[M], E <: Estimator[M]](
    +        spark: SparkSession,
    +        estimator: E with HasWeightCol with HasLabelCol with HasFeaturesCol,
    +        categoricalFeaturesInfo: Map[Int, Int],
    +        numPoints: Int,
    +        numClasses: Int,
    +        numFeatures: Int,
    +        modelEquals: (M, M) => Unit,
    +        seed: Long): Unit = {
    +    import spark.implicits._
    +    val df = generateNoisyData(numPoints, numClasses, numFeatures, categoricalFeaturesInfo,
    +      seed).toDF()
    +    val outlierFunction = getRandomLinearPredictionFunction(numFeatures, numClasses, seed - 1)
    +    val outlierDF = df.as[Instance].flatMap { case Instance(l, w, f) =>
    +      List.fill(3)(Instance(outlierFunction(f), 0.0001, f)) ++ List(Instance(l, w, f))
    +    }
    +    val trueModel = estimator.set(estimator.weightCol, "").fit(df)
    +    val outlierModel = estimator.set(estimator.weightCol, "weight").fit(outlierDF)
    +    modelEquals(trueModel, outlierModel)
    +  }
    +
    +  /**
    +   * Helper function for testing sample weights. Tests that giving constant weights to
    +   * each data point yields the same model, regardless of the magnitude of the weight.
    +   */
    +  def testArbitrarilyScaledWeights[M <: Model[M], E <: Estimator[M]](
    +      data: Dataset[LabeledPoint],
    +      estimator: E with HasWeightCol with HasLabelCol with HasFeaturesCol,
    +      modelEquals: (M, M) => Unit): Unit = {
    +    estimator
    +      .set(estimator.labelCol, "label")
    +      .set(estimator.featuresCol, "features")
    +      .set(estimator.weightCol, "weight")
    +    val models = Seq(0.001, 1.0, 1000.0).map { w =>
    --- End diff --
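
    The diff hunk above ends at the line under review. For context, the body of
    `testArbitrarilyScaledWeights` plausibly continues along these lines; this is a
    sketch reconstructed from the helpers shown above, not necessarily the PR's
    exact code:

        // Sketch (assumed continuation, not the PR's exact code): fit one model
        // per constant weight and check that the weight magnitude is irrelevant.
        import org.apache.spark.sql.functions.lit
        val models = Seq(0.001, 1.0, 1000.0).map { w =>
          // Assign the same weight w to every point; a weighting-correct
          // estimator should produce the same model for any shared magnitude.
          val df = data.withColumn("weight", lit(w))
          estimator.fit(df)
        }
        // All constant-weight models should agree pairwise.
        models.sliding(2).foreach { case Seq(m1, m2) => modelEquals(m1, m2) }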
    
    I disagree. 1.0 and 1000.0 are both integer-valued, and I have run into
    algorithms that mishandle fractional weights before. With these three values we
    cover tiny, unit, and large weights as well as fractional and integer weights.
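
    For completeness, here is a minimal, hypothetical invocation of one of these
    helpers from a test suite; the choice of `LinearRegression` (which mixes in
    `HasWeightCol`), the tolerance, and the parameter values are illustrative, not
    taken from the PR:

        // Hypothetical usage sketch; assumes a SparkSession `spark` is in scope.
        import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}

        val lr = new LinearRegression().setMaxIter(50)
        // Treat two models as equal if their coefficients agree within tolerance.
        def modelEquals(m1: LinearRegressionModel, m2: LinearRegressionModel): Unit = {
          m1.coefficients.toArray.zip(m2.coefficients.toArray).foreach {
            case (a, b) => assert(math.abs(a - b) < 0.05)
          }
        }
        // Continuous label (numClasses = 0), 5 continuous features, no categoricals.
        MLTestingUtils.testOversamplingVsWeighting[LinearRegressionModel, LinearRegression](
          spark, lr, Map.empty, 1000, 0, 5, modelEquals, 42L)

    Any other estimator that mixes in HasWeightCol could be swapped in the same way,
    with a modelEquals appropriate to its model type.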

