[ 
https://issues.apache.org/jira/browse/SPARK-22971?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

zhengruifeng updated SPARK-22971:
---------------------------------
    Affects Version/s:     (was: 2.3.0)
                       2.4.0

> OneVsRestModel should use temporary RawPredictionCol
> ----------------------------------------------------
>
>                 Key: SPARK-22971
>                 URL: https://issues.apache.org/jira/browse/SPARK-22971
>             Project: Spark
>          Issue Type: Bug
>          Components: ML
>    Affects Versions: 2.4.0
>            Reporter: zhengruifeng
>            Priority: Minor
>
> The issue occurs when I transform one DataFrame with two different classification 
> models: first with a {{RandomForestClassificationModel}}, then with a 
> {{OneVsRestModel}}.
> The first transform generates a new column "rawPrediction", a name that is also 
> used internally in {{OneVsRestModel#transform}}, which causes the failure.
> {code}
> scala> val df = 
> spark.read.format("libsvm").load("/Users/zrf/Dev/OpenSource/spark/data/mllib/sample_multiclass_classification_data.txt")
> 18/01/05 17:08:18 WARN ObjectStore: Failed to get database global_temp, 
> returning NoSuchObjectException
> df: org.apache.spark.sql.DataFrame = [label: double, features: vector]
> scala> val rf = new RandomForestClassifier()
> rf: org.apache.spark.ml.classification.RandomForestClassifier = 
> rfc_c11b1e1e1f7f
> scala> val rfm = rf.fit(df)
> rfm: org.apache.spark.ml.classification.RandomForestClassificationModel = 
> RandomForestClassificationModel (uid=rfc_c11b1e1e1f7f) with 20 trees
> scala> val lr = new LogisticRegression().setMaxIter(1)
> lr: org.apache.spark.ml.classification.LogisticRegression = 
> logreg_f5a5285eba06
> scala> val ovr = new OneVsRest().setClassifier(lr)
> ovr: org.apache.spark.ml.classification.OneVsRest = oneVsRest_8f5584190634
> scala> val ovrModel = ovr.fit(df)
> ovrModel: org.apache.spark.ml.classification.OneVsRestModel = 
> oneVsRest_8f5584190634
> scala> val df2 = rfm.setPredictionCol("rfPred").transform(df)
> df2: org.apache.spark.sql.DataFrame = [label: double, features: vector ... 3 
> more fields]
> scala> val df3 = ovrModel.setPredictionCol("ovrPred").transform(df2)
> java.lang.IllegalArgumentException: requirement failed: Column rawPrediction 
> already exists.
>   at scala.Predef$.require(Predef.scala:224)
>   at org.apache.spark.ml.util.SchemaUtils$.appendColumn(SchemaUtils.scala:101)
>   at org.apache.spark.ml.util.SchemaUtils$.appendColumn(SchemaUtils.scala:91)
>   at 
> org.apache.spark.ml.classification.ClassifierParams$class.validateAndTransformSchema(Classifier.scala:43)
>   at 
> org.apache.spark.ml.classification.ProbabilisticClassificationModel.org$apache$spark$ml$classification$ProbabilisticClassifierParams$$super$validateAndTransformSchema(ProbabilisticClassifier.scala:77)
>   at 
> org.apache.spark.ml.classification.ProbabilisticClassifierParams$class.validateAndTransformSchema(ProbabilisticClassifier.scala:37)
>   at 
> org.apache.spark.ml.classification.LogisticRegressionModel.org$apache$spark$ml$classification$LogisticRegressionParams$$super$validateAndTransformSchema(LogisticRegression.scala:904)
>   at 
> org.apache.spark.ml.classification.LogisticRegressionParams$class.validateAndTransformSchema(LogisticRegression.scala:265)
>   at 
> org.apache.spark.ml.classification.LogisticRegressionModel.validateAndTransformSchema(LogisticRegression.scala:904)
>   at org.apache.spark.ml.PredictionModel.transformSchema(Predictor.scala:192)
>   at org.apache.spark.ml.PipelineStage.transformSchema(Pipeline.scala:74)
>   at 
> org.apache.spark.ml.classification.ProbabilisticClassificationModel.transform(ProbabilisticClassifier.scala:104)
>   at 
> org.apache.spark.ml.classification.OneVsRestModel$$anonfun$7.apply(OneVsRest.scala:184)
>   at 
> org.apache.spark.ml.classification.OneVsRestModel$$anonfun$7.apply(OneVsRest.scala:173)
>   at 
> scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57)
>   at 
> scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66)
>   at scala.collection.mutable.ArrayOps$ofRef.foldLeft(ArrayOps.scala:186)
>   at 
> org.apache.spark.ml.classification.OneVsRestModel.transform(OneVsRest.scala:173)
>   ... 50 elided
> {code}
> {{OneVsRestModel#transform}} only generates a new prediction column, and 
> should not fail because of other existing columns.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to