[ https://issues.apache.org/jira/browse/SPARK-10188?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Noel Smith updated SPARK-10188: ------------------------------- Description: Pyspark {{CrossValidator}} is giving incorrect results when selecting estimators using RMSE as an evaluation metric. In the example below, it should be selecting the {{LinearRegression}} estimator with zero regularization as that gives the most accurate result, but instead it selects the one with the largest. Probably related to: SPARK-10097 {code} from pyspark.ml.evaluation import RegressionEvaluator from pyspark.ml.regression import LinearRegression from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel from pyspark.ml.feature import Binarizer from pyspark.mllib.linalg import Vectors from pyspark.sql import SQLContext sqlContext = SQLContext(sc) # Label = 2 * feature train = sqlContext.createDataFrame([ (Vectors.dense([10.0]), 20.0), (Vectors.dense([100.0]), 200.0), (Vectors.dense([1000.0]), 2000.0)] * 10, ["features", "label"]) test = sqlContext.createDataFrame([ (Vectors.dense([1000.0]),)], ["features"]) # Expected prediction 2000.0 print LinearRegression(regParam=0.0).fit(train).transform(test).collect() # Predicts 2000.0 (perfect) print LinearRegression(regParam=100.0).fit(train).transform(test).collect() # Predicts 1869.31 print LinearRegression(regParam=1000000.0).fit(train).transform(test).collect() # 741.08 (worst) # Cross-validation lr = LinearRegression() rmse_eval = RegressionEvaluator(metricName="rmse") grid = (ParamGridBuilder() .addGrid( lr.regParam, [0.0, 100.0, 1000000.0] ) .build()) cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=rmse_eval) cv_model = cv.fit(train) cv_model.bestModel.transform(test).collect() # Predicts 741.08 (i.e. 
worst model selected) {code} One workaround for users would be to add a wrapper around the selected evaluator to invert the metric: {code} class InvertedEvaluator(Evaluator): def __init__(self, evaluator): super(Evaluator, self).__init__() self.evaluator = evaluator def _evaluate(self, dataset): return -self.evaluator.evaluate(dataset) invertedEvaluator = InvertedEvaluator(RegressionEvaluator(metricName="rmse")) {code} was: Pyspark {{CrossValidator}} is giving incorrect results when selecting estimators using RMSE as an evaluation metric. In the example below, it should be selecting the {{LogisticRegression}} estimator with zero regularization as that gives the most accurate result, but instead it selects the one with the largest. Probably related to: SPARK-10097 {code} from pyspark.ml.evaluation import RegressionEvaluator from pyspark.ml.regression import LinearRegression from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel from pyspark.ml.feature import Binarizer from pyspark.mllib.linalg import Vectors from pyspark.sql import SQLContext sqlContext = SQLContext(sc) # Label = 2 * feature train = sqlContext.createDataFrame([ (Vectors.dense([10.0]), 20.0), (Vectors.dense([100.0]), 200.0), (Vectors.dense([1000.0]), 2000.0)] * 10, ["features", "label"]) test = sqlContext.createDataFrame([ (Vectors.dense([1000.0]),)], ["features"]) # Expected prediction 2000.0 print LinearRegression(regParam=0.0).fit(train).transform(test).collect() # Predicts 2000.0 (perfect) print LinearRegression(regParam=100.0).fit(train).transform(test).collect() # Predicts 1869.31 print LinearRegression(regParam=1000000.0).fit(train).transform(test).collect() # 741.08 (worst) # Cross-validation lr = LinearRegression() rmse_eval = RegressionEvaluator() grid = (ParamGridBuilder() .addGrid( lr.regParam, [0.0, 100.0, 1000000.0] ) .build()) cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=rmse_eval) cv_model = cv.fit(train) 
cv_model.bestModel.transform(test).collect() # Predicts 741.08 (i.e. worst model selected) {code} Once workaround for users would be to add a wrapper around the selected evaluator to invert the metric: {code} class InvertedEvaluator(Evaluator): def __init__(self, evaluator): super(Evaluator, self).__init__() self.evaluator = evaluator def _evaluate(self, dataset): return -self.evaluator.evaluate(dataset) invertedEvaluator = InvertedEvaluator(RegressionEvaluator(metricName="rmse")) {code} > Pyspark CrossValidator with RMSE selects incorrect model > -------------------------------------------------------- > > Key: SPARK-10188 > URL: https://issues.apache.org/jira/browse/SPARK-10188 > Project: Spark > Issue Type: Bug > Components: PySpark > Affects Versions: 1.5.0 > Reporter: Noel Smith > > Pyspark {{CrossValidator}} is giving incorrect results when selecting > estimators using RMSE as an evaluation metric. > In the example below, it should be selecting the {{LinearRegression}} > estimator with zero regularization as that gives the most accurate result, > but instead it selects the one with the largest. 
> Probably related to: SPARK-10097 > {code} > from pyspark.ml.evaluation import RegressionEvaluator > from pyspark.ml.regression import LinearRegression > from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, > CrossValidatorModel > from pyspark.ml.feature import Binarizer > from pyspark.mllib.linalg import Vectors > from pyspark.sql import SQLContext > sqlContext = SQLContext(sc) > # Label = 2 * feature > train = sqlContext.createDataFrame([ > (Vectors.dense([10.0]), 20.0), > (Vectors.dense([100.0]), 200.0), > (Vectors.dense([1000.0]), 2000.0)] * 10, > ["features", "label"]) > test = sqlContext.createDataFrame([ > (Vectors.dense([1000.0]),)], > ["features"]) > # Expected prediction 2000.0 > print LinearRegression(regParam=0.0).fit(train).transform(test).collect() # > Predicts 2000.0 (perfect) > print LinearRegression(regParam=100.0).fit(train).transform(test).collect() # > Predicts 1869.31 > print > LinearRegression(regParam=1000000.0).fit(train).transform(test).collect() # > 741.08 (worst) > # Cross-validation > lr = LinearRegression() > rmse_eval = RegressionEvaluator(metricName="rmse") > grid = (ParamGridBuilder() > .addGrid( lr.regParam, [0.0, 100.0, 1000000.0] ) > .build()) > cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, > evaluator=rmse_eval) > cv_model = cv.fit(train) > cv_model.bestModel.transform(test).collect() # Predicts 741.08 (i.e. 
worst > model selected) > {code} > One workaround for users would be to add a wrapper around the selected > evaluator to invert the metric: > {code} > class InvertedEvaluator(Evaluator): > def __init__(self, evaluator): > super(Evaluator, self).__init__() > self.evaluator = evaluator > > def _evaluate(self, dataset): > return -self.evaluator.evaluate(dataset) > invertedEvaluator = InvertedEvaluator(RegressionEvaluator(metricName="rmse")) > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org