[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/15721 --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user sethah commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93891844 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -18,15 +18,15 @@ package org.apache.spark.ml.util import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.{Estimator, Model} -import org.apache.spark.ml.attribute.NominalAttribute +import org.apache.spark.ml._ import org.apache.spark.ml.evaluation.Evaluator -import org.apache.spark.ml.feature.Instance -import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.feature.{Instance, LabeledPoint} +import org.apache.spark.ml.linalg.{BLAS, DenseMatrix, DenseVector, Vector, Vectors} --- End diff -- done --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user sethah commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93891724 --- Diff: mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala --- @@ -1810,52 +1810,23 @@ class LogisticRegressionSuite } - test("binary logistic regression with weighted data") { -val numClasses = 2 -val numPoints = 40 -val outlierData = MLTestingUtils.genClassificationInstancesWithWeightedOutliers(spark, - numClasses, numPoints) -val testData = Array.tabulate[LabeledPoint](numClasses) { i => - LabeledPoint(i.toDouble, Vectors.dense(i.toDouble)) -}.toSeq.toDF() -val lr = new LogisticRegression().setFamily("binomial").setWeightCol("weight") -val model = lr.fit(outlierData) -val results = model.transform(testData).select("label", "prediction").collect() - -// check that the predictions are the one to one mapping -results.foreach { case Row(label: Double, pred: Double) => - assert(label === pred) + test("logistic regression with sample weights") { +def modelEquals(m1: LogisticRegressionModel, m2: LogisticRegressionModel): Unit = { + assert(m1.coefficientMatrix ~== m2.coefficientMatrix absTol 0.01) --- End diff -- Done, thanks. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user sethah commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93891824 --- Diff: mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala --- @@ -157,50 +162,26 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa validateProbabilities(featureAndProbabilities, model, "multinomial") } - test("Naive Bayes Multinomial with weighted samples") { -val nPoints = 1000 -val piArray = Array(0.5, 0.1, 0.4).map(math.log) -val thetaArray = Array( - Array(0.70, 0.10, 0.10, 0.10), // label 0 - Array(0.10, 0.70, 0.10, 0.10), // label 1 - Array(0.10, 0.10, 0.70, 0.10) // label 2 -).map(_.map(math.log)) - -val testData = generateNaiveBayesInput(piArray, thetaArray, nPoints, 42, "multinomial").toDF() -val (overSampledData, weightedData) = - MLTestingUtils.genEquivalentOversampledAndWeightedInstances(testData, -"label", "features", 42L) -val nb = new NaiveBayes().setModelType("multinomial") -val unweightedModel = nb.fit(weightedData) -val overSampledModel = nb.fit(overSampledData) -val weightedModel = nb.setWeightCol("weight").fit(weightedData) -assert(weightedModel.theta ~== overSampledModel.theta relTol 0.001) -assert(weightedModel.pi ~== overSampledModel.pi relTol 0.001) -assert(unweightedModel.theta !~= overSampledModel.theta relTol 0.001) -assert(unweightedModel.pi !~= overSampledModel.pi relTol 0.001) - } - - test("Naive Bayes Bernoulli with weighted samples") { -val nPoints = 1 -val piArray = Array(0.5, 0.3, 0.2).map(math.log) -val thetaArray = Array( - Array(0.50, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.40), // label 0 - Array(0.02, 0.70, 0.10, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02), // label 1 - Array(0.02, 0.02, 0.60, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.30) // label 2 -).map(_.map(math.log)) - -val testData = generateNaiveBayesInput(piArray, thetaArray, nPoints, 42, "bernoulli").toDF() -val (overSampledData, weightedData) 
= - MLTestingUtils.genEquivalentOversampledAndWeightedInstances(testData, -"label", "features", 42L) -val nb = new NaiveBayes().setModelType("bernoulli") -val unweightedModel = nb.fit(weightedData) -val overSampledModel = nb.fit(overSampledData) -val weightedModel = nb.setWeightCol("weight").fit(weightedData) -assert(weightedModel.theta ~== overSampledModel.theta relTol 0.001) -assert(weightedModel.pi ~== overSampledModel.pi relTol 0.001) -assert(unweightedModel.theta !~= overSampledModel.theta relTol 0.001) -assert(unweightedModel.pi !~= overSampledModel.pi relTol 0.001) + test("Naive Bayes with weighted samples") { +val numClasses = 3 +def modelEquals(m1: NaiveBayesModel, m2: NaiveBayesModel): Unit = { + assert(m1.pi ~== m2.pi relTol 0.01) + assert(m1.theta ~== m2.theta relTol 0.01) +} +val testParams = Seq( + ("bernoulli", bernoulliDataset), + ("multinomial", dataset) +) +testParams.foreach { case (family, dataset) => + // NaiveBayes is sensitive to constant scaling of the weights unless smoothing is set to 0 + val estimator = new NaiveBayes().setSmoothing(0.0).setModelType(family) --- End diff -- I think the test with smoothing as 0.0 is a nice check on the weighting algorithm for Naive Bayes, so I prefer to keep it. I made separate smoothing/no smoothing estimators. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user sethah commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93891797 --- Diff: mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala --- @@ -25,31 +25,36 @@ import breeze.stats.distributions.{Multinomial => BrzMultinomial} import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.classification.NaiveBayes.{Bernoulli, Multinomial} import org.apache.spark.ml.classification.NaiveBayesSuite._ -import org.apache.spark.ml.feature.LabeledPoint +import org.apache.spark.ml.feature.{Instance, LabeledPoint} import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.functions.lit class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ @transient var dataset: Dataset[_] = _ + @transient var bernoulliDataset: Dataset[_] = _ + + private val seed = 42 override def beforeAll(): Unit = { super.beforeAll() -val pi = Array(0.5, 0.1, 0.4).map(math.log) +val pi = Array(0.3, 0.3, 0.4).map(math.log) val theta = Array( - Array(0.70, 0.10, 0.10, 0.10), // label 0 - Array(0.10, 0.70, 0.10, 0.10), // label 1 - Array(0.10, 0.10, 0.70, 0.10) // label 2 + Array(0.30, 0.30, 0.30, 0.30), // label 0 + Array(0.30, 0.30, 0.30, 0.30), // label 1 + Array(0.40, 0.40, 0.40, 0.40) // label 2 --- End diff -- Ya this is changed so that when we set smoothing to zero for the weighted tests, we don't get some theta values of infinity. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. 
If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user sethah commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93891833 --- Diff: mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala --- @@ -47,6 +49,11 @@ class LinearRegressionSuite datasetWithDenseFeature = sc.parallelize(LinearDataGenerator.generateLinearInput( intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), xVariance = Array(0.7, 1.2), nPoints = 1, seed, eps = 0.1), 2).map(_.asML).toDF() + +weightedDatasetWithDenseFeature = sc.parallelize(LinearDataGenerator.generateLinearInput( --- End diff -- Done. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user sethah commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93891849 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -182,34 +182,18 @@ object MLTestingUtils extends SparkFunSuite { .toMap } - def genClassificationInstancesWithWeightedOutliers( - spark: SparkSession, - numClasses: Int, - numInstances: Int): DataFrame = { -val data = Array.tabulate[Instance](numInstances) { i => - val feature = i % numClasses - if (i < numInstances / 3) { -// give large weights to minority of data with 1 to 1 mapping feature to label -Instance(feature, 1.0, Vectors.dense(feature)) - } else { -// give small weights to majority of data points with reverse mapping -Instance(numClasses - feature - 1, 0.01, Vectors.dense(feature)) - } -} -val labelMeta = - NominalAttribute.defaultAttr.withName("label").withNumValues(numClasses).toMetadata() -spark.createDataFrame(data).select(col("label").as("label", labelMeta), col("weight"), - col("features")) - } - + /** + * Given a dataframe, generate two output dataframes: one having the original rows oversampled + * an integer number of times, and one having the original rows but with a column of weights + * proportional to the number of oversampled instances in the oversampled dataframe. + */ --- End diff -- done --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user sethah commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93891892 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -224,4 +208,59 @@ object MLTestingUtils extends SparkFunSuite { }.toDF() (overSampledData, weightedData) } + + /** + * Helper function for testing sample weights. Tests that oversampling each point is equivalent + * to assigning a sample weight proportional to the number of samples for each point. + */ + def testOversamplingVsWeighting[M <: Model[M], E <: Estimator[M]]( + df: DataFrame, + estimator: E with HasWeightCol with HasLabelCol with HasFeaturesCol, + modelEquals: (M, M) => Unit, + seed: Long): Unit = { +val (overSampledData, weightedData) = genEquivalentOversampledAndWeightedInstances( + df, estimator.getLabelCol, estimator.getFeaturesCol, seed) +val weightedModel = estimator.set(estimator.weightCol, "weight").fit(weightedData) +val overSampledModel = estimator.set(estimator.weightCol, "").fit(overSampledData) +modelEquals(weightedModel, overSampledModel) + } + + /** + * Helper function for testing sample weights. Tests that injecting a large number of outliers + * with very small sample weights does not affect fitting. The predictor should learn the true + * model despite the outliers. + */ + def testOutliersWithSmallWeights[M <: Model[M], E <: Estimator[M]]( + ds: Dataset[Instance], --- End diff -- done --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user sethah commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93891859 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -182,34 +182,18 @@ object MLTestingUtils extends SparkFunSuite { .toMap } - def genClassificationInstancesWithWeightedOutliers( - spark: SparkSession, - numClasses: Int, - numInstances: Int): DataFrame = { -val data = Array.tabulate[Instance](numInstances) { i => - val feature = i % numClasses - if (i < numInstances / 3) { -// give large weights to minority of data with 1 to 1 mapping feature to label -Instance(feature, 1.0, Vectors.dense(feature)) - } else { -// give small weights to majority of data points with reverse mapping -Instance(numClasses - feature - 1, 0.01, Vectors.dense(feature)) - } -} -val labelMeta = - NominalAttribute.defaultAttr.withName("label").withNumValues(numClasses).toMetadata() -spark.createDataFrame(data).select(col("label").as("label", labelMeta), col("weight"), - col("features")) - } - + /** + * Given a dataframe, generate two output dataframes: one having the original rows oversampled + * an integer number of times, and one having the original rows but with a column of weights + * proportional to the number of oversampled instances in the oversampled dataframe. + */ def genEquivalentOversampledAndWeightedInstances( --- End diff -- I made them all take `Dataset[LabeledPoint]`. Good suggestion. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93763421 --- Diff: mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala --- @@ -47,6 +49,11 @@ class LinearRegressionSuite datasetWithDenseFeature = sc.parallelize(LinearDataGenerator.generateLinearInput( intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), xVariance = Array(0.7, 1.2), nPoints = 1, seed, eps = 0.1), 2).map(_.asML).toDF() + +weightedDatasetWithDenseFeature = sc.parallelize(LinearDataGenerator.generateLinearInput( --- End diff -- ```datasetWithStrongNoise```? I think ```weighted*``` is really misleading. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93760292 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -182,34 +182,18 @@ object MLTestingUtils extends SparkFunSuite { .toMap } - def genClassificationInstancesWithWeightedOutliers( - spark: SparkSession, - numClasses: Int, - numInstances: Int): DataFrame = { -val data = Array.tabulate[Instance](numInstances) { i => - val feature = i % numClasses - if (i < numInstances / 3) { -// give large weights to minority of data with 1 to 1 mapping feature to label -Instance(feature, 1.0, Vectors.dense(feature)) - } else { -// give small weights to majority of data points with reverse mapping -Instance(numClasses - feature - 1, 0.01, Vectors.dense(feature)) - } -} -val labelMeta = - NominalAttribute.defaultAttr.withName("label").withNumValues(numClasses).toMetadata() -spark.createDataFrame(data).select(col("label").as("label", labelMeta), col("weight"), - col("features")) - } - + /** + * Given a dataframe, generate two output dataframes: one having the original rows oversampled + * an integer number of times, and one having the original rows but with a column of weights + * proportional to the number of oversampled instances in the oversampled dataframe. + */ --- End diff -- Nit: ```dataframe``` -> ```DataFrame``` for all occurrences. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93761461 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -182,34 +182,18 @@ object MLTestingUtils extends SparkFunSuite { .toMap } - def genClassificationInstancesWithWeightedOutliers( - spark: SparkSession, - numClasses: Int, - numInstances: Int): DataFrame = { -val data = Array.tabulate[Instance](numInstances) { i => - val feature = i % numClasses - if (i < numInstances / 3) { -// give large weights to minority of data with 1 to 1 mapping feature to label -Instance(feature, 1.0, Vectors.dense(feature)) - } else { -// give small weights to majority of data points with reverse mapping -Instance(numClasses - feature - 1, 0.01, Vectors.dense(feature)) - } -} -val labelMeta = - NominalAttribute.defaultAttr.withName("label").withNumValues(numClasses).toMetadata() -spark.createDataFrame(data).select(col("label").as("label", labelMeta), col("weight"), - col("features")) - } - + /** + * Given a dataframe, generate two output dataframes: one having the original rows oversampled + * an integer number of times, and one having the original rows but with a column of weights + * proportional to the number of oversampled instances in the oversampled dataframe. + */ def genEquivalentOversampledAndWeightedInstances( --- End diff -- For this and the following three functions, some use ```data: DataFrame, labelCol: String, featuresCol```, while others use ```data: Dataset[LabeledPoint]``` or ```data: Dataset[Instance]``` as the arguments. Could we make the arguments consistent? I prefer the latter. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA.
--- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93765361 --- Diff: mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala --- @@ -25,31 +25,36 @@ import breeze.stats.distributions.{Multinomial => BrzMultinomial} import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.classification.NaiveBayes.{Bernoulli, Multinomial} import org.apache.spark.ml.classification.NaiveBayesSuite._ -import org.apache.spark.ml.feature.LabeledPoint +import org.apache.spark.ml.feature.{Instance, LabeledPoint} import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.functions.lit class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ @transient var dataset: Dataset[_] = _ + @transient var bernoulliDataset: Dataset[_] = _ + + private val seed = 42 override def beforeAll(): Unit = { super.beforeAll() -val pi = Array(0.5, 0.1, 0.4).map(math.log) +val pi = Array(0.3, 0.3, 0.4).map(math.log) val theta = Array( - Array(0.70, 0.10, 0.10, 0.10), // label 0 - Array(0.10, 0.70, 0.10, 0.10), // label 1 - Array(0.10, 0.10, 0.70, 0.10) // label 2 + Array(0.30, 0.30, 0.30, 0.30), // label 0 + Array(0.30, 0.30, 0.30, 0.30), // label 1 + Array(0.40, 0.40, 0.40, 0.40) // label 2 --- End diff -- Could you let me know why you changed this? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA.
--- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93762729 --- Diff: mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala --- @@ -1810,52 +1810,23 @@ class LogisticRegressionSuite } - test("binary logistic regression with weighted data") { -val numClasses = 2 -val numPoints = 40 -val outlierData = MLTestingUtils.genClassificationInstancesWithWeightedOutliers(spark, - numClasses, numPoints) -val testData = Array.tabulate[LabeledPoint](numClasses) { i => - LabeledPoint(i.toDouble, Vectors.dense(i.toDouble)) -}.toSeq.toDF() -val lr = new LogisticRegression().setFamily("binomial").setWeightCol("weight") -val model = lr.fit(outlierData) -val results = model.transform(testData).select("label", "prediction").collect() - -// check that the predictions are the one to one mapping -results.foreach { case Row(label: Double, pred: Double) => - assert(label === pred) + test("logistic regression with sample weights") { +def modelEquals(m1: LogisticRegressionModel, m2: LogisticRegressionModel): Unit = { + assert(m1.coefficientMatrix ~== m2.coefficientMatrix absTol 0.01) --- End diff -- Should we also check ```interceptVector```? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93762341 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -224,4 +208,59 @@ object MLTestingUtils extends SparkFunSuite { }.toDF() (overSampledData, weightedData) } + + /** + * Helper function for testing sample weights. Tests that oversampling each point is equivalent + * to assigning a sample weight proportional to the number of samples for each point. + */ + def testOversamplingVsWeighting[M <: Model[M], E <: Estimator[M]]( + df: DataFrame, + estimator: E with HasWeightCol with HasLabelCol with HasFeaturesCol, + modelEquals: (M, M) => Unit, + seed: Long): Unit = { +val (overSampledData, weightedData) = genEquivalentOversampledAndWeightedInstances( + df, estimator.getLabelCol, estimator.getFeaturesCol, seed) +val weightedModel = estimator.set(estimator.weightCol, "weight").fit(weightedData) +val overSampledModel = estimator.set(estimator.weightCol, "").fit(overSampledData) +modelEquals(weightedModel, overSampledModel) + } + + /** + * Helper function for testing sample weights. Tests that injecting a large number of outliers + * with very small sample weights does not affect fitting. The predictor should learn the true + * model despite the outliers. + */ + def testOutliersWithSmallWeights[M <: Model[M], E <: Estimator[M]]( + ds: Dataset[Instance], --- End diff -- I'd prefer to change this to ```data: Dataset[LabeledPoint]``` (pass in the dataset without weights), and move ```.withColumn("weight", lit(1.0))``` (which is currently duplicated in the test cases of each algorithm) inside this function. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA.
--- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93765319 --- Diff: mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala --- @@ -157,50 +162,26 @@ class NaiveBayesSuite extends SparkFunSuite with MLlibTestSparkContext with Defa validateProbabilities(featureAndProbabilities, model, "multinomial") } - test("Naive Bayes Multinomial with weighted samples") { -val nPoints = 1000 -val piArray = Array(0.5, 0.1, 0.4).map(math.log) -val thetaArray = Array( - Array(0.70, 0.10, 0.10, 0.10), // label 0 - Array(0.10, 0.70, 0.10, 0.10), // label 1 - Array(0.10, 0.10, 0.70, 0.10) // label 2 -).map(_.map(math.log)) - -val testData = generateNaiveBayesInput(piArray, thetaArray, nPoints, 42, "multinomial").toDF() -val (overSampledData, weightedData) = - MLTestingUtils.genEquivalentOversampledAndWeightedInstances(testData, -"label", "features", 42L) -val nb = new NaiveBayes().setModelType("multinomial") -val unweightedModel = nb.fit(weightedData) -val overSampledModel = nb.fit(overSampledData) -val weightedModel = nb.setWeightCol("weight").fit(weightedData) -assert(weightedModel.theta ~== overSampledModel.theta relTol 0.001) -assert(weightedModel.pi ~== overSampledModel.pi relTol 0.001) -assert(unweightedModel.theta !~= overSampledModel.theta relTol 0.001) -assert(unweightedModel.pi !~= overSampledModel.pi relTol 0.001) - } - - test("Naive Bayes Bernoulli with weighted samples") { -val nPoints = 1 -val piArray = Array(0.5, 0.3, 0.2).map(math.log) -val thetaArray = Array( - Array(0.50, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.40), // label 0 - Array(0.02, 0.70, 0.10, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02), // label 1 - Array(0.02, 0.02, 0.60, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.30) // label 2 -).map(_.map(math.log)) - -val testData = generateNaiveBayesInput(piArray, thetaArray, nPoints, 42, "bernoulli").toDF() -val (overSampledData, 
weightedData) = - MLTestingUtils.genEquivalentOversampledAndWeightedInstances(testData, -"label", "features", 42L) -val nb = new NaiveBayes().setModelType("bernoulli") -val unweightedModel = nb.fit(weightedData) -val overSampledModel = nb.fit(overSampledData) -val weightedModel = nb.setWeightCol("weight").fit(weightedData) -assert(weightedModel.theta ~== overSampledModel.theta relTol 0.001) -assert(weightedModel.pi ~== overSampledModel.pi relTol 0.001) -assert(unweightedModel.theta !~= overSampledModel.theta relTol 0.001) -assert(unweightedModel.pi !~= overSampledModel.pi relTol 0.001) + test("Naive Bayes with weighted samples") { +val numClasses = 3 +def modelEquals(m1: NaiveBayesModel, m2: NaiveBayesModel): Unit = { + assert(m1.pi ~== m2.pi relTol 0.01) + assert(m1.theta ~== m2.theta relTol 0.01) +} +val testParams = Seq( + ("bernoulli", bernoulliDataset), + ("multinomial", dataset) +) +testParams.foreach { case (family, dataset) => + // NaiveBayes is sensitive to constant scaling of the weights unless smoothing is set to 0 + val estimator = new NaiveBayes().setSmoothing(0.0).setModelType(family) --- End diff -- I think it's not practical to set smoothing to 0.0, so it's better to test NB with a non-zero smoothing value. If that is not applicable to ```testArbitrarilyScaledWeights```, we can omit it. Alternatively, generate two estimators whose smoothing values are 0.0 and 1.0 respectively, for the different test functions. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93759584 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -18,15 +18,15 @@ package org.apache.spark.ml.util import org.apache.spark.SparkFunSuite -import org.apache.spark.ml.{Estimator, Model} -import org.apache.spark.ml.attribute.NominalAttribute +import org.apache.spark.ml._ import org.apache.spark.ml.evaluation.Evaluator -import org.apache.spark.ml.feature.Instance -import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.feature.{Instance, LabeledPoint} +import org.apache.spark.ml.linalg.{BLAS, DenseMatrix, DenseVector, Vector, Vectors} --- End diff -- ```BLAS, DenseMatrix, DenseVector``` were not used and can be removed. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user sethah commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93481050 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -224,4 +208,139 @@ object MLTestingUtils extends SparkFunSuite { }.toDF() (overSampledData, weightedData) } + + /** + * Generates a linear prediction function where the coefficients are generated randomly. + * The function produces a continuous (numClasses = 0) or categorical (numClasses > 0) label. + */ + def getRandomLinearPredictionFunction( + numFeatures: Int, + numClasses: Int, + seed: Long): (Vector => Double) = { +val rng = new scala.util.Random(seed) +val trueNumClasses = if (numClasses == 0) 1 else numClasses +val coefArray = Array.fill(numFeatures * trueNumClasses)(rng.nextDouble - 0.5) +(features: Vector) => { + if (numClasses == 0) { +BLAS.dot(features, new DenseVector(coefArray)) + } else { +val margins = new DenseVector(new Array[Double](numClasses)) +val coefMat = new DenseMatrix(numClasses, numFeatures, coefArray) +BLAS.gemv(1.0, coefMat, features, 1.0, margins) +margins.argmax.toDouble + } +} + } + + /** + * A helper function to generate synthetic data. Generates random feature values, + * both categorical and continuous, according to `categoricalFeaturesInfo`. The label is generated + * from a random prediction function, and noise is added to the true label. + * + * @param numPoints The number of data points to generate. + * @param numClasses The number of classes the outcome can take on. 0 for continuous labels. + * @param numFeatures The number of features in the data. + * @param categoricalFeaturesInfo Map of (featureIndex -> numCategories) for categorical features. + * @param seed Random seed. + * @param noiseLevel A number in [0.0, 1.0] indicating how much noise to add to the label. + * @return Generated sequence of noisy instances. 
+ */ + def generateNoisyData( + numPoints: Int, + numClasses: Int, + numFeatures: Int, + categoricalFeaturesInfo: Map[Int, Int], + seed: Long, + noiseLevel: Double = 0.3): Seq[Instance] = { +require(noiseLevel >= 0.0 && noiseLevel <= 1.0, "noiseLevel must be in range [0.0, 1.0]") +val rng = new scala.util.Random(seed) +val predictionFunc = getRandomLinearPredictionFunction(numFeatures, numClasses, seed) +Range(0, numPoints).map { i => + val features = Vectors.dense(Array.tabulate(numFeatures) { j => +val numCategories = categoricalFeaturesInfo.getOrElse(j, 0) +if (numCategories > 0) { + rng.nextInt(numCategories) +} else { + rng.nextDouble() - 0.5 +} + }) + val label = predictionFunc(features) + val noisyLabel = if (numClasses > 0) { +// with probability equal to noiseLevel, select a random class instead of the true class +if (rng.nextDouble < noiseLevel) rng.nextInt(numClasses) else label + } else { +// add noise to the label proportional to the noise level +label + noiseLevel * rng.nextGaussian() + } + Instance(noisyLabel, 1.0, features) +} + } + + /** + * Helper function for testing sample weights. Tests that oversampling each point is equivalent + * to assigning a sample weight proportional to the number of samples for each point. + */ + def testOversamplingVsWeighting[M <: Model[M], E <: Estimator[M]]( +spark: SparkSession, --- End diff -- done --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user sethah commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93481882 --- Diff: mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala --- @@ -47,6 +49,11 @@ class LinearRegressionSuite datasetWithDenseFeature = sc.parallelize(LinearDataGenerator.generateLinearInput( intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3), xVariance = Array(0.7, 1.2), nPoints = 1, seed, eps = 0.1), 2).map(_.asML).toDF() + +weightedDatasetWithDenseFeature = sc.parallelize(LinearDataGenerator.generateLinearInput( --- End diff -- I added this small dataset with a higher noise value for weighted testing. It's necessary because when we test oversampling vs weighting, we need the noise to be high enough that the model learns incorrect coefficients when the weights are not applied. The coefficients used to generate each point are the same, but some points are emphasized more with weights. This dataset needs to be small enough and have enough noise that it doesn't still learn the true coefficients when the weights are not applied, if that makes sense. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user sethah commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93481080 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -224,4 +208,139 @@ object MLTestingUtils extends SparkFunSuite { }.toDF() (overSampledData, weightedData) } + + /** + * Generates a linear prediction function where the coefficients are generated randomly. + * The function produces a continuous (numClasses = 0) or categorical (numClasses > 0) label. + */ + def getRandomLinearPredictionFunction( + numFeatures: Int, + numClasses: Int, + seed: Long): (Vector => Double) = { +val rng = new scala.util.Random(seed) +val trueNumClasses = if (numClasses == 0) 1 else numClasses +val coefArray = Array.fill(numFeatures * trueNumClasses)(rng.nextDouble - 0.5) +(features: Vector) => { + if (numClasses == 0) { +BLAS.dot(features, new DenseVector(coefArray)) + } else { +val margins = new DenseVector(new Array[Double](numClasses)) +val coefMat = new DenseMatrix(numClasses, numFeatures, coefArray) +BLAS.gemv(1.0, coefMat, features, 1.0, margins) +margins.argmax.toDouble + } +} + } + + /** + * A helper function to generate synthetic data. Generates random feature values, + * both categorical and continuous, according to `categoricalFeaturesInfo`. The label is generated + * from a random prediction function, and noise is added to the true label. + * + * @param numPoints The number of data points to generate. + * @param numClasses The number of classes the outcome can take on. 0 for continuous labels. + * @param numFeatures The number of features in the data. + * @param categoricalFeaturesInfo Map of (featureIndex -> numCategories) for categorical features. + * @param seed Random seed. + * @param noiseLevel A number in [0.0, 1.0] indicating how much noise to add to the label. + * @return Generated sequence of noisy instances. 
+ */ + def generateNoisyData( + numPoints: Int, + numClasses: Int, + numFeatures: Int, + categoricalFeaturesInfo: Map[Int, Int], + seed: Long, + noiseLevel: Double = 0.3): Seq[Instance] = { +require(noiseLevel >= 0.0 && noiseLevel <= 1.0, "noiseLevel must be in range [0.0, 1.0]") +val rng = new scala.util.Random(seed) +val predictionFunc = getRandomLinearPredictionFunction(numFeatures, numClasses, seed) +Range(0, numPoints).map { i => + val features = Vectors.dense(Array.tabulate(numFeatures) { j => +val numCategories = categoricalFeaturesInfo.getOrElse(j, 0) +if (numCategories > 0) { + rng.nextInt(numCategories) +} else { + rng.nextDouble() - 0.5 +} + }) + val label = predictionFunc(features) + val noisyLabel = if (numClasses > 0) { +// with probability equal to noiseLevel, select a random class instead of the true class +if (rng.nextDouble < noiseLevel) rng.nextInt(numClasses) else label + } else { +// add noise to the label proportional to the noise level +label + noiseLevel * rng.nextGaussian() + } + Instance(noisyLabel, 1.0, features) +} + } + + /** + * Helper function for testing sample weights. Tests that oversampling each point is equivalent + * to assigning a sample weight proportional to the number of samples for each point. + */ + def testOversamplingVsWeighting[M <: Model[M], E <: Estimator[M]]( +spark: SparkSession, +estimator: E with HasWeightCol with HasLabelCol with HasFeaturesCol, +categoricalFeaturesInfo: Map[Int, Int], +numPoints: Int, +numClasses: Int, +numFeatures: Int, +modelEquals: (M, M) => Unit, +seed: Long): Unit = { +import spark.implicits._ +val df = generateNoisyData(numPoints, numClasses, numFeatures, categoricalFeaturesInfo, --- End diff -- done --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. 
If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user sethah commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93481030 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -224,4 +208,139 @@ object MLTestingUtils extends SparkFunSuite { }.toDF() (overSampledData, weightedData) } + + /** + * Generates a linear prediction function where the coefficients are generated randomly. + * The function produces a continuous (numClasses = 0) or categorical (numClasses > 0) label. + */ + def getRandomLinearPredictionFunction( + numFeatures: Int, + numClasses: Int, + seed: Long): (Vector => Double) = { +val rng = new scala.util.Random(seed) +val trueNumClasses = if (numClasses == 0) 1 else numClasses +val coefArray = Array.fill(numFeatures * trueNumClasses)(rng.nextDouble - 0.5) +(features: Vector) => { + if (numClasses == 0) { +BLAS.dot(features, new DenseVector(coefArray)) + } else { +val margins = new DenseVector(new Array[Double](numClasses)) +val coefMat = new DenseMatrix(numClasses, numFeatures, coefArray) +BLAS.gemv(1.0, coefMat, features, 1.0, margins) +margins.argmax.toDouble + } +} + } + + /** + * A helper function to generate synthetic data. Generates random feature values, + * both categorical and continuous, according to `categoricalFeaturesInfo`. The label is generated + * from a random prediction function, and noise is added to the true label. + * + * @param numPoints The number of data points to generate. + * @param numClasses The number of classes the outcome can take on. 0 for continuous labels. + * @param numFeatures The number of features in the data. + * @param categoricalFeaturesInfo Map of (featureIndex -> numCategories) for categorical features. + * @param seed Random seed. + * @param noiseLevel A number in [0.0, 1.0] indicating how much noise to add to the label. + * @return Generated sequence of noisy instances. + */ + def generateNoisyData( --- End diff -- Fair point. 
Actually, the noise is not strictly necessary for this patch in the other cases. I can use the existing datasets (for the most part). I removed this generator and passed the test data to the testing util methods. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93459365 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -224,4 +208,139 @@ object MLTestingUtils extends SparkFunSuite { }.toDF() (overSampledData, weightedData) } + + /** + * Generates a linear prediction function where the coefficients are generated randomly. + * The function produces a continuous (numClasses = 0) or categorical (numClasses > 0) label. + */ + def getRandomLinearPredictionFunction( + numFeatures: Int, + numClasses: Int, + seed: Long): (Vector => Double) = { +val rng = new scala.util.Random(seed) +val trueNumClasses = if (numClasses == 0) 1 else numClasses +val coefArray = Array.fill(numFeatures * trueNumClasses)(rng.nextDouble - 0.5) +(features: Vector) => { + if (numClasses == 0) { +BLAS.dot(features, new DenseVector(coefArray)) + } else { +val margins = new DenseVector(new Array[Double](numClasses)) +val coefMat = new DenseMatrix(numClasses, numFeatures, coefArray) +BLAS.gemv(1.0, coefMat, features, 1.0, margins) +margins.argmax.toDouble + } +} + } + + /** + * A helper function to generate synthetic data. Generates random feature values, + * both categorical and continuous, according to `categoricalFeaturesInfo`. The label is generated + * from a random prediction function, and noise is added to the true label. + * + * @param numPoints The number of data points to generate. + * @param numClasses The number of classes the outcome can take on. 0 for continuous labels. + * @param numFeatures The number of features in the data. + * @param categoricalFeaturesInfo Map of (featureIndex -> numCategories) for categorical features. + * @param seed Random seed. + * @param noiseLevel A number in [0.0, 1.0] indicating how much noise to add to the label. + * @return Generated sequence of noisy instances. 
+ */ + def generateNoisyData( + numPoints: Int, + numClasses: Int, + numFeatures: Int, + categoricalFeaturesInfo: Map[Int, Int], + seed: Long, + noiseLevel: Double = 0.3): Seq[Instance] = { +require(noiseLevel >= 0.0 && noiseLevel <= 1.0, "noiseLevel must be in range [0.0, 1.0]") +val rng = new scala.util.Random(seed) +val predictionFunc = getRandomLinearPredictionFunction(numFeatures, numClasses, seed) +Range(0, numPoints).map { i => + val features = Vectors.dense(Array.tabulate(numFeatures) { j => +val numCategories = categoricalFeaturesInfo.getOrElse(j, 0) +if (numCategories > 0) { + rng.nextInt(numCategories) +} else { + rng.nextDouble() - 0.5 +} + }) + val label = predictionFunc(features) + val noisyLabel = if (numClasses > 0) { +// with probability equal to noiseLevel, select a random class instead of the true class +if (rng.nextDouble < noiseLevel) rng.nextInt(numClasses) else label + } else { +// add noise to the label proportional to the noise level +label + noiseLevel * rng.nextGaussian() + } + Instance(noisyLabel, 1.0, features) +} + } + + /** + * Helper function for testing sample weights. Tests that oversampling each point is equivalent + * to assigning a sample weight proportional to the number of samples for each point. 
+ */ + def testOversamplingVsWeighting[M <: Model[M], E <: Estimator[M]]( +spark: SparkSession, +estimator: E with HasWeightCol with HasLabelCol with HasFeaturesCol, +categoricalFeaturesInfo: Map[Int, Int], +numPoints: Int, +numClasses: Int, +numFeatures: Int, +modelEquals: (M, M) => Unit, +seed: Long): Unit = { +import spark.implicits._ +val df = generateNoisyData(numPoints, numClasses, numFeatures, categoricalFeaturesInfo, + seed).toDF() +val (overSampledData, weightedData) = genEquivalentOversampledAndWeightedInstances( + df, estimator.getLabelCol, estimator.getFeaturesCol, seed) +val weightedModel = estimator.set(estimator.weightCol, "weight").fit(weightedData) +val overSampledModel = estimator.set(estimator.weightCol, "").fit(overSampledData) +modelEquals(weightedModel, overSampledModel) + } + + /** + * Helper function for testing sample weights. Tests that injecting a large number of outliers + * with very small sample weights does not affect fitting. The predictor should learn the t
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user sethah commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93420640 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -224,4 +208,139 @@ object MLTestingUtils extends SparkFunSuite { }.toDF() (overSampledData, weightedData) } + + /** + * Generates a linear prediction function where the coefficients are generated randomly. + * The function produces a continuous (numClasses = 0) or categorical (numClasses > 0) label. + */ + def getRandomLinearPredictionFunction( + numFeatures: Int, + numClasses: Int, + seed: Long): (Vector => Double) = { +val rng = new scala.util.Random(seed) +val trueNumClasses = if (numClasses == 0) 1 else numClasses +val coefArray = Array.fill(numFeatures * trueNumClasses)(rng.nextDouble - 0.5) +(features: Vector) => { + if (numClasses == 0) { +BLAS.dot(features, new DenseVector(coefArray)) + } else { +val margins = new DenseVector(new Array[Double](numClasses)) +val coefMat = new DenseMatrix(numClasses, numFeatures, coefArray) +BLAS.gemv(1.0, coefMat, features, 1.0, margins) +margins.argmax.toDouble + } +} + } + + /** + * A helper function to generate synthetic data. Generates random feature values, + * both categorical and continuous, according to `categoricalFeaturesInfo`. The label is generated + * from a random prediction function, and noise is added to the true label. + * + * @param numPoints The number of data points to generate. + * @param numClasses The number of classes the outcome can take on. 0 for continuous labels. + * @param numFeatures The number of features in the data. + * @param categoricalFeaturesInfo Map of (featureIndex -> numCategories) for categorical features. + * @param seed Random seed. + * @param noiseLevel A number in [0.0, 1.0] indicating how much noise to add to the label. + * @return Generated sequence of noisy instances. 
+ */ + def generateNoisyData( + numPoints: Int, + numClasses: Int, + numFeatures: Int, + categoricalFeaturesInfo: Map[Int, Int], + seed: Long, + noiseLevel: Double = 0.3): Seq[Instance] = { +require(noiseLevel >= 0.0 && noiseLevel <= 1.0, "noiseLevel must be in range [0.0, 1.0]") +val rng = new scala.util.Random(seed) +val predictionFunc = getRandomLinearPredictionFunction(numFeatures, numClasses, seed) +Range(0, numPoints).map { i => + val features = Vectors.dense(Array.tabulate(numFeatures) { j => +val numCategories = categoricalFeaturesInfo.getOrElse(j, 0) +if (numCategories > 0) { + rng.nextInt(numCategories) +} else { + rng.nextDouble() - 0.5 +} + }) + val label = predictionFunc(features) + val noisyLabel = if (numClasses > 0) { +// with probability equal to noiseLevel, select a random class instead of the true class +if (rng.nextDouble < noiseLevel) rng.nextInt(numClasses) else label + } else { +// add noise to the label proportional to the noise level +label + noiseLevel * rng.nextGaussian() + } + Instance(noisyLabel, 1.0, features) +} + } + + /** + * Helper function for testing sample weights. Tests that oversampling each point is equivalent + * to assigning a sample weight proportional to the number of samples for each point. 
+ */ + def testOversamplingVsWeighting[M <: Model[M], E <: Estimator[M]]( +spark: SparkSession, +estimator: E with HasWeightCol with HasLabelCol with HasFeaturesCol, +categoricalFeaturesInfo: Map[Int, Int], +numPoints: Int, +numClasses: Int, +numFeatures: Int, +modelEquals: (M, M) => Unit, +seed: Long): Unit = { +import spark.implicits._ +val df = generateNoisyData(numPoints, numClasses, numFeatures, categoricalFeaturesInfo, + seed).toDF() +val (overSampledData, weightedData) = genEquivalentOversampledAndWeightedInstances( + df, estimator.getLabelCol, estimator.getFeaturesCol, seed) +val weightedModel = estimator.set(estimator.weightCol, "weight").fit(weightedData) +val overSampledModel = estimator.set(estimator.weightCol, "").fit(overSampledData) +modelEquals(weightedModel, overSampledModel) + } + + /** + * Helper function for testing sample weights. Tests that injecting a large number of outliers + * with very small sample weights does not affect fitting. The predictor should learn the true
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93174061 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -224,4 +208,139 @@ object MLTestingUtils extends SparkFunSuite { }.toDF() (overSampledData, weightedData) } + + /** + * Generates a linear prediction function where the coefficients are generated randomly. + * The function produces a continuous (numClasses = 0) or categorical (numClasses > 0) label. + */ + def getRandomLinearPredictionFunction( + numFeatures: Int, + numClasses: Int, + seed: Long): (Vector => Double) = { +val rng = new scala.util.Random(seed) +val trueNumClasses = if (numClasses == 0) 1 else numClasses +val coefArray = Array.fill(numFeatures * trueNumClasses)(rng.nextDouble - 0.5) +(features: Vector) => { + if (numClasses == 0) { +BLAS.dot(features, new DenseVector(coefArray)) + } else { +val margins = new DenseVector(new Array[Double](numClasses)) +val coefMat = new DenseMatrix(numClasses, numFeatures, coefArray) +BLAS.gemv(1.0, coefMat, features, 1.0, margins) +margins.argmax.toDouble + } +} + } + + /** + * A helper function to generate synthetic data. Generates random feature values, + * both categorical and continuous, according to `categoricalFeaturesInfo`. The label is generated + * from a random prediction function, and noise is added to the true label. + * + * @param numPoints The number of data points to generate. + * @param numClasses The number of classes the outcome can take on. 0 for continuous labels. + * @param numFeatures The number of features in the data. + * @param categoricalFeaturesInfo Map of (featureIndex -> numCategories) for categorical features. + * @param seed Random seed. + * @param noiseLevel A number in [0.0, 1.0] indicating how much noise to add to the label. + * @return Generated sequence of noisy instances. 
+ */ + def generateNoisyData( + numPoints: Int, + numClasses: Int, + numFeatures: Int, + categoricalFeaturesInfo: Map[Int, Int], + seed: Long, + noiseLevel: Double = 0.3): Seq[Instance] = { +require(noiseLevel >= 0.0 && noiseLevel <= 1.0, "noiseLevel must be in range [0.0, 1.0]") +val rng = new scala.util.Random(seed) +val predictionFunc = getRandomLinearPredictionFunction(numFeatures, numClasses, seed) +Range(0, numPoints).map { i => + val features = Vectors.dense(Array.tabulate(numFeatures) { j => +val numCategories = categoricalFeaturesInfo.getOrElse(j, 0) +if (numCategories > 0) { + rng.nextInt(numCategories) +} else { + rng.nextDouble() - 0.5 +} + }) + val label = predictionFunc(features) + val noisyLabel = if (numClasses > 0) { +// with probability equal to noiseLevel, select a random class instead of the true class +if (rng.nextDouble < noiseLevel) rng.nextInt(numClasses) else label + } else { +// add noise to the label proportional to the noise level +label + noiseLevel * rng.nextGaussian() + } + Instance(noisyLabel, 1.0, features) +} + } + + /** + * Helper function for testing sample weights. Tests that oversampling each point is equivalent + * to assigning a sample weight proportional to the number of samples for each point. 
+ */ + def testOversamplingVsWeighting[M <: Model[M], E <: Estimator[M]]( +spark: SparkSession, +estimator: E with HasWeightCol with HasLabelCol with HasFeaturesCol, +categoricalFeaturesInfo: Map[Int, Int], +numPoints: Int, +numClasses: Int, +numFeatures: Int, +modelEquals: (M, M) => Unit, +seed: Long): Unit = { +import spark.implicits._ +val df = generateNoisyData(numPoints, numClasses, numFeatures, categoricalFeaturesInfo, + seed).toDF() +val (overSampledData, weightedData) = genEquivalentOversampledAndWeightedInstances( + df, estimator.getLabelCol, estimator.getFeaturesCol, seed) +val weightedModel = estimator.set(estimator.weightCol, "weight").fit(weightedData) +val overSampledModel = estimator.set(estimator.weightCol, "").fit(overSampledData) +modelEquals(weightedModel, overSampledModel) + } + + /** + * Helper function for testing sample weights. Tests that injecting a large number of outliers + * with very small sample weights does not affect fitting. The predictor should learn the t
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93172081 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -224,4 +208,139 @@ object MLTestingUtils extends SparkFunSuite { }.toDF() (overSampledData, weightedData) } + + /** + * Generates a linear prediction function where the coefficients are generated randomly. + * The function produces a continuous (numClasses = 0) or categorical (numClasses > 0) label. + */ + def getRandomLinearPredictionFunction( + numFeatures: Int, + numClasses: Int, + seed: Long): (Vector => Double) = { +val rng = new scala.util.Random(seed) +val trueNumClasses = if (numClasses == 0) 1 else numClasses +val coefArray = Array.fill(numFeatures * trueNumClasses)(rng.nextDouble - 0.5) +(features: Vector) => { + if (numClasses == 0) { +BLAS.dot(features, new DenseVector(coefArray)) + } else { +val margins = new DenseVector(new Array[Double](numClasses)) +val coefMat = new DenseMatrix(numClasses, numFeatures, coefArray) +BLAS.gemv(1.0, coefMat, features, 1.0, margins) +margins.argmax.toDouble + } +} + } + + /** + * A helper function to generate synthetic data. Generates random feature values, + * both categorical and continuous, according to `categoricalFeaturesInfo`. The label is generated + * from a random prediction function, and noise is added to the true label. + * + * @param numPoints The number of data points to generate. + * @param numClasses The number of classes the outcome can take on. 0 for continuous labels. + * @param numFeatures The number of features in the data. + * @param categoricalFeaturesInfo Map of (featureIndex -> numCategories) for categorical features. + * @param seed Random seed. + * @param noiseLevel A number in [0.0, 1.0] indicating how much noise to add to the label. + * @return Generated sequence of noisy instances. 
+ */ + def generateNoisyData( + numPoints: Int, + numClasses: Int, + numFeatures: Int, + categoricalFeaturesInfo: Map[Int, Int], + seed: Long, + noiseLevel: Double = 0.3): Seq[Instance] = { +require(noiseLevel >= 0.0 && noiseLevel <= 1.0, "noiseLevel must be in range [0.0, 1.0]") +val rng = new scala.util.Random(seed) +val predictionFunc = getRandomLinearPredictionFunction(numFeatures, numClasses, seed) +Range(0, numPoints).map { i => + val features = Vectors.dense(Array.tabulate(numFeatures) { j => +val numCategories = categoricalFeaturesInfo.getOrElse(j, 0) +if (numCategories > 0) { + rng.nextInt(numCategories) +} else { + rng.nextDouble() - 0.5 +} + }) + val label = predictionFunc(features) + val noisyLabel = if (numClasses > 0) { +// with probability equal to noiseLevel, select a random class instead of the true class +if (rng.nextDouble < noiseLevel) rng.nextInt(numClasses) else label + } else { +// add noise to the label proportional to the noise level +label + noiseLevel * rng.nextGaussian() + } + Instance(noisyLabel, 1.0, features) +} + } + + /** + * Helper function for testing sample weights. Tests that oversampling each point is equivalent + * to assigning a sample weight proportional to the number of samples for each point. + */ + def testOversamplingVsWeighting[M <: Model[M], E <: Estimator[M]]( +spark: SparkSession, +estimator: E with HasWeightCol with HasLabelCol with HasFeaturesCol, +categoricalFeaturesInfo: Map[Int, Int], +numPoints: Int, +numClasses: Int, +numFeatures: Int, +modelEquals: (M, M) => Unit, +seed: Long): Unit = { +import spark.implicits._ +val df = generateNoisyData(numPoints, numClasses, numFeatures, categoricalFeaturesInfo, --- End diff -- If we add noise in the native data generators (see my comment above), we should remove this line and pass in the generated dataset (which already includes noise) directly. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well.
If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93172224 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -224,4 +208,139 @@ object MLTestingUtils extends SparkFunSuite { }.toDF() (overSampledData, weightedData) } + + /** + * Generates a linear prediction function where the coefficients are generated randomly. + * The function produces a continuous (numClasses = 0) or categorical (numClasses > 0) label. + */ + def getRandomLinearPredictionFunction( + numFeatures: Int, + numClasses: Int, + seed: Long): (Vector => Double) = { +val rng = new scala.util.Random(seed) +val trueNumClasses = if (numClasses == 0) 1 else numClasses +val coefArray = Array.fill(numFeatures * trueNumClasses)(rng.nextDouble - 0.5) +(features: Vector) => { + if (numClasses == 0) { +BLAS.dot(features, new DenseVector(coefArray)) + } else { +val margins = new DenseVector(new Array[Double](numClasses)) +val coefMat = new DenseMatrix(numClasses, numFeatures, coefArray) +BLAS.gemv(1.0, coefMat, features, 1.0, margins) +margins.argmax.toDouble + } +} + } + + /** + * A helper function to generate synthetic data. Generates random feature values, + * both categorical and continuous, according to `categoricalFeaturesInfo`. The label is generated + * from a random prediction function, and noise is added to the true label. + * + * @param numPoints The number of data points to generate. + * @param numClasses The number of classes the outcome can take on. 0 for continuous labels. + * @param numFeatures The number of features in the data. + * @param categoricalFeaturesInfo Map of (featureIndex -> numCategories) for categorical features. + * @param seed Random seed. + * @param noiseLevel A number in [0.0, 1.0] indicating how much noise to add to the label. + * @return Generated sequence of noisy instances. 
+ */ + def generateNoisyData( + numPoints: Int, + numClasses: Int, + numFeatures: Int, + categoricalFeaturesInfo: Map[Int, Int], + seed: Long, + noiseLevel: Double = 0.3): Seq[Instance] = { +require(noiseLevel >= 0.0 && noiseLevel <= 1.0, "noiseLevel must be in range [0.0, 1.0]") +val rng = new scala.util.Random(seed) +val predictionFunc = getRandomLinearPredictionFunction(numFeatures, numClasses, seed) +Range(0, numPoints).map { i => + val features = Vectors.dense(Array.tabulate(numFeatures) { j => +val numCategories = categoricalFeaturesInfo.getOrElse(j, 0) +if (numCategories > 0) { + rng.nextInt(numCategories) +} else { + rng.nextDouble() - 0.5 +} + }) + val label = predictionFunc(features) + val noisyLabel = if (numClasses > 0) { +// with probability equal to noiseLevel, select a random class instead of the true class +if (rng.nextDouble < noiseLevel) rng.nextInt(numClasses) else label + } else { +// add noise to the label proportional to the noise level +label + noiseLevel * rng.nextGaussian() + } + Instance(noisyLabel, 1.0, features) +} + } + + /** + * Helper function for testing sample weights. Tests that oversampling each point is equivalent + * to assigning a sample weight proportional to the number of samples for each point. 
+ */ + def testOversamplingVsWeighting[M <: Model[M], E <: Estimator[M]]( +spark: SparkSession, +estimator: E with HasWeightCol with HasLabelCol with HasFeaturesCol, +categoricalFeaturesInfo: Map[Int, Int], +numPoints: Int, +numClasses: Int, +numFeatures: Int, +modelEquals: (M, M) => Unit, +seed: Long): Unit = { +import spark.implicits._ +val df = generateNoisyData(numPoints, numClasses, numFeatures, categoricalFeaturesInfo, + seed).toDF() +val (overSampledData, weightedData) = genEquivalentOversampledAndWeightedInstances( + df, estimator.getLabelCol, estimator.getFeaturesCol, seed) +val weightedModel = estimator.set(estimator.weightCol, "weight").fit(weightedData) +val overSampledModel = estimator.set(estimator.weightCol, "").fit(overSampledData) +modelEquals(weightedModel, overSampledModel) + } + + /** + * Helper function for testing sample weights. Tests that injecting a large number of outliers + * with very small sample weights does not affect fitting. The predictor should learn the t
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93171343 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -224,4 +208,139 @@ object MLTestingUtils extends SparkFunSuite { }.toDF() (overSampledData, weightedData) } + + /** + * Generates a linear prediction function where the coefficients are generated randomly. + * The function produces a continuous (numClasses = 0) or categorical (numClasses > 0) label. + */ + def getRandomLinearPredictionFunction( + numFeatures: Int, + numClasses: Int, + seed: Long): (Vector => Double) = { +val rng = new scala.util.Random(seed) +val trueNumClasses = if (numClasses == 0) 1 else numClasses +val coefArray = Array.fill(numFeatures * trueNumClasses)(rng.nextDouble - 0.5) +(features: Vector) => { + if (numClasses == 0) { +BLAS.dot(features, new DenseVector(coefArray)) + } else { +val margins = new DenseVector(new Array[Double](numClasses)) +val coefMat = new DenseMatrix(numClasses, numFeatures, coefArray) +BLAS.gemv(1.0, coefMat, features, 1.0, margins) +margins.argmax.toDouble + } +} + } + + /** + * A helper function to generate synthetic data. Generates random feature values, + * both categorical and continuous, according to `categoricalFeaturesInfo`. The label is generated + * from a random prediction function, and noise is added to the true label. + * + * @param numPoints The number of data points to generate. + * @param numClasses The number of classes the outcome can take on. 0 for continuous labels. + * @param numFeatures The number of features in the data. + * @param categoricalFeaturesInfo Map of (featureIndex -> numCategories) for categorical features. + * @param seed Random seed. + * @param noiseLevel A number in [0.0, 1.0] indicating how much noise to add to the label. + * @return Generated sequence of noisy instances. 
+ */ + def generateNoisyData( --- End diff -- I am a bit worried about whether we should provide this general noisy data generation function: * It would be better to generate data following the rules of specific algorithms; for example, users could provide the coefficients and the mean and variance of the generated features for ```LogisticRegression```. * Actually, some generators such as [```LinearDataGenerator.generateLinearInput```](https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/mllib/util/LinearDataGenerator.scala#L97) have already considered the noise level. Just like ```LinearDataGenerator.generateLinearInput```, I think we should add an ```eps``` argument to other generators such as ```LogisticRegressionSuite.generateLogisticInput, LogisticRegressionSuite.generateMultinomialLogisticInput, NaiveBayesSuite.generateNaiveBayesInput```, to make them output noisy labels natively. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93172654 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -224,4 +208,139 @@ object MLTestingUtils extends SparkFunSuite { }.toDF() (overSampledData, weightedData) } + + /** + * Generates a linear prediction function where the coefficients are generated randomly. + * The function produces a continuous (numClasses = 0) or categorical (numClasses > 0) label. + */ + def getRandomLinearPredictionFunction( + numFeatures: Int, + numClasses: Int, + seed: Long): (Vector => Double) = { +val rng = new scala.util.Random(seed) +val trueNumClasses = if (numClasses == 0) 1 else numClasses +val coefArray = Array.fill(numFeatures * trueNumClasses)(rng.nextDouble - 0.5) +(features: Vector) => { + if (numClasses == 0) { +BLAS.dot(features, new DenseVector(coefArray)) + } else { +val margins = new DenseVector(new Array[Double](numClasses)) +val coefMat = new DenseMatrix(numClasses, numFeatures, coefArray) +BLAS.gemv(1.0, coefMat, features, 1.0, margins) +margins.argmax.toDouble + } +} + } + + /** + * A helper function to generate synthetic data. Generates random feature values, + * both categorical and continuous, according to `categoricalFeaturesInfo`. The label is generated + * from a random prediction function, and noise is added to the true label. + * + * @param numPoints The number of data points to generate. + * @param numClasses The number of classes the outcome can take on. 0 for continuous labels. + * @param numFeatures The number of features in the data. + * @param categoricalFeaturesInfo Map of (featureIndex -> numCategories) for categorical features. + * @param seed Random seed. + * @param noiseLevel A number in [0.0, 1.0] indicating how much noise to add to the label. + * @return Generated sequence of noisy instances. 
+ */ + def generateNoisyData( + numPoints: Int, + numClasses: Int, + numFeatures: Int, + categoricalFeaturesInfo: Map[Int, Int], + seed: Long, + noiseLevel: Double = 0.3): Seq[Instance] = { +require(noiseLevel >= 0.0 && noiseLevel <= 1.0, "noiseLevel must be in range [0.0, 1.0]") +val rng = new scala.util.Random(seed) +val predictionFunc = getRandomLinearPredictionFunction(numFeatures, numClasses, seed) +Range(0, numPoints).map { i => + val features = Vectors.dense(Array.tabulate(numFeatures) { j => +val numCategories = categoricalFeaturesInfo.getOrElse(j, 0) +if (numCategories > 0) { + rng.nextInt(numCategories) +} else { + rng.nextDouble() - 0.5 +} + }) + val label = predictionFunc(features) + val noisyLabel = if (numClasses > 0) { +// with probability equal to noiseLevel, select a random class instead of the true class +if (rng.nextDouble < noiseLevel) rng.nextInt(numClasses) else label + } else { +// add noise to the label proportional to the noise level +label + noiseLevel * rng.nextGaussian() + } + Instance(noisyLabel, 1.0, features) +} + } + + /** + * Helper function for testing sample weights. Tests that oversampling each point is equivalent + * to assigning a sample weight proportional to the number of samples for each point. 
+ */ + def testOversamplingVsWeighting[M <: Model[M], E <: Estimator[M]]( +spark: SparkSession, +estimator: E with HasWeightCol with HasLabelCol with HasFeaturesCol, +categoricalFeaturesInfo: Map[Int, Int], +numPoints: Int, +numClasses: Int, +numFeatures: Int, +modelEquals: (M, M) => Unit, +seed: Long): Unit = { +import spark.implicits._ +val df = generateNoisyData(numPoints, numClasses, numFeatures, categoricalFeaturesInfo, + seed).toDF() +val (overSampledData, weightedData) = genEquivalentOversampledAndWeightedInstances( + df, estimator.getLabelCol, estimator.getFeaturesCol, seed) +val weightedModel = estimator.set(estimator.weightCol, "weight").fit(weightedData) +val overSampledModel = estimator.set(estimator.weightCol, "").fit(overSampledData) +modelEquals(weightedModel, overSampledModel) + } + + /** + * Helper function for testing sample weights. Tests that injecting a large number of outliers + * with very small sample weights does not affect fitting. The predictor should learn the t
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
Github user yanboliang commented on a diff in the pull request: https://github.com/apache/spark/pull/15721#discussion_r93172182 --- Diff: mllib/src/test/scala/org/apache/spark/ml/util/MLTestingUtils.scala --- @@ -224,4 +208,139 @@ object MLTestingUtils extends SparkFunSuite { }.toDF() (overSampledData, weightedData) } + + /** + * Generates a linear prediction function where the coefficients are generated randomly. + * The function produces a continuous (numClasses = 0) or categorical (numClasses > 0) label. + */ + def getRandomLinearPredictionFunction( + numFeatures: Int, + numClasses: Int, + seed: Long): (Vector => Double) = { +val rng = new scala.util.Random(seed) +val trueNumClasses = if (numClasses == 0) 1 else numClasses +val coefArray = Array.fill(numFeatures * trueNumClasses)(rng.nextDouble - 0.5) +(features: Vector) => { + if (numClasses == 0) { +BLAS.dot(features, new DenseVector(coefArray)) + } else { +val margins = new DenseVector(new Array[Double](numClasses)) +val coefMat = new DenseMatrix(numClasses, numFeatures, coefArray) +BLAS.gemv(1.0, coefMat, features, 1.0, margins) +margins.argmax.toDouble + } +} + } + + /** + * A helper function to generate synthetic data. Generates random feature values, + * both categorical and continuous, according to `categoricalFeaturesInfo`. The label is generated + * from a random prediction function, and noise is added to the true label. + * + * @param numPoints The number of data points to generate. + * @param numClasses The number of classes the outcome can take on. 0 for continuous labels. + * @param numFeatures The number of features in the data. + * @param categoricalFeaturesInfo Map of (featureIndex -> numCategories) for categorical features. + * @param seed Random seed. + * @param noiseLevel A number in [0.0, 1.0] indicating how much noise to add to the label. + * @return Generated sequence of noisy instances. 
+ */ + def generateNoisyData( + numPoints: Int, + numClasses: Int, + numFeatures: Int, + categoricalFeaturesInfo: Map[Int, Int], + seed: Long, + noiseLevel: Double = 0.3): Seq[Instance] = { +require(noiseLevel >= 0.0 && noiseLevel <= 1.0, "noiseLevel must be in range [0.0, 1.0]") +val rng = new scala.util.Random(seed) +val predictionFunc = getRandomLinearPredictionFunction(numFeatures, numClasses, seed) +Range(0, numPoints).map { i => + val features = Vectors.dense(Array.tabulate(numFeatures) { j => +val numCategories = categoricalFeaturesInfo.getOrElse(j, 0) +if (numCategories > 0) { + rng.nextInt(numCategories) +} else { + rng.nextDouble() - 0.5 +} + }) + val label = predictionFunc(features) + val noisyLabel = if (numClasses > 0) { +// with probability equal to noiseLevel, select a random class instead of the true class +if (rng.nextDouble < noiseLevel) rng.nextInt(numClasses) else label + } else { +// add noise to the label proportional to the noise level +label + noiseLevel * rng.nextGaussian() + } + Instance(noisyLabel, 1.0, features) +} + } + + /** + * Helper function for testing sample weights. Tests that oversampling each point is equivalent + * to assigning a sample weight proportional to the number of samples for each point. + */ + def testOversamplingVsWeighting[M <: Model[M], E <: Estimator[M]]( +spark: SparkSession, --- End diff -- Indent. --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #15721: [SPARK-17772][ML][TEST] Add test functions for ML...
GitHub user sethah opened a pull request: https://github.com/apache/spark/pull/15721 [SPARK-17772][ML][TEST] Add test functions for ML sample weights ## What changes were proposed in this pull request? More and more ML algos are accepting sample weights, and they have been tested rather heterogeneously and with code duplication. This patch adds extensible helper methods to `MLTestingUtils` that can be reused by various algorithms accepting sample weights. Up to now, there seem to be a few tests that have been implemented commonly: * Check that oversampling is the same as giving the instances sample weights proportional to the number of samples * Check that outliers with tiny sample weights do not affect the algorithm's performance This patch adds an additional test: * Check that algorithms are invariant to constant scaling of the sample weights, i.e. uniform sample weights with `w_i = 1.0` are effectively the same as uniform sample weights with `w_i = 10000` or `w_i = 0.0001` The instances of these tests occurred in LinearRegression, NaiveBayes, and LogisticRegression. Those tests have been removed/modified to use the new helper methods. These helper functions will be of use when [SPARK-9478](https://issues.apache.org/jira/browse/SPARK-9478) is implemented. ## How was this patch tested? This patch only involves modifying test suites. ## Other notes IsotonicRegression and GeneralizedLinearRegression also extend `HasWeightCol`. I did not modify their test suites because leaving them out makes this patch easier to review, and because they did not duplicate the same tests as the three suites that were modified. If we want to change them later, we can create a JIRA for it now, but it's open for debate.
You can merge this pull request into a Git repository by running: $ git pull https://github.com/sethah/spark SPARK-17772 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/15721.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #15721 commit e10be455ee943230a96e57370b718683647e6f03 Author: sethah Date: 2016-10-18T21:27:02Z add sample weight helper tests --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org