[ https://issues.apache.org/jira/browse/SPARK-14843?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Cheng Lian resolved SPARK-14843. -------------------------------- Resolution: Fixed Fix Version/s: 2.0.0 Issue resolved by pull request 12611 [https://github.com/apache/spark/pull/12611] > Error while encoding: java.lang.ClassCastException with LibSVMRelation > ---------------------------------------------------------------------- > > Key: SPARK-14843 > URL: https://issues.apache.org/jira/browse/SPARK-14843 > Project: Spark > Issue Type: Bug > Components: ML, MLlib, SQL > Reporter: Nick Pentreath > Fix For: 2.0.0 > > > While trying to run some example ML linear regression code, I came across the > following. In fact this error occurs when doing {{./bin/run-example > ml.LinearRegressionWithElasticNetExample}}. > {code} > scala> import org.apache.spark.ml.regression.LinearRegression > import org.apache.spark.ml.regression.LinearRegression > scala> import org.apache.spark.mllib.linalg.Vector > import org.apache.spark.mllib.linalg.Vector > scala> import org.apache.spark.sql.Row > import org.apache.spark.sql.Row > scala> val data = > sqlContext.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt") > data: org.apache.spark.sql.DataFrame = [label: double, features: vector] > scala> val model = lr.fit(data) > {code} > Stack trace: > {code} > Driver stacktrace: > ... > at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1276) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:357) > at org.apache.spark.rdd.RDD.take(RDD.scala:1250) > at org.apache.spark.rdd.RDD$$anonfun$first$1.apply(RDD.scala:1290) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151) > at > org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112) > at org.apache.spark.rdd.RDD.withScope(RDD.scala:357) > at org.apache.spark.rdd.RDD.first(RDD.scala:1289) > at > org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:165) > at > org.apache.spark.ml.regression.LinearRegression.train(LinearRegression.scala:69) > at org.apache.spark.ml.Predictor.fit(Predictor.scala:90) > ... 48 elided > Caused by: java.lang.RuntimeException: Error while encoding: > java.lang.ClassCastException: java.lang.Double cannot be cast to > org.apache.spark.mllib.linalg.Vector > if (input[0, org.apache.spark.sql.Row].isNullAt) null else newInstance(class > org.apache.spark.mllib.linalg.VectorUDT).serialize > :- input[0, org.apache.spark.sql.Row].isNullAt > : :- input[0, org.apache.spark.sql.Row] > : +- 0 > :- null > +- newInstance(class org.apache.spark.mllib.linalg.VectorUDT).serialize > :- newInstance(class org.apache.spark.mllib.linalg.VectorUDT) > +- input[0, org.apache.spark.sql.Row].get > :- input[0, org.apache.spark.sql.Row] > +- 0 > at > org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.toRow(ExpressionEncoder.scala:230) > at > org.apache.spark.ml.source.libsvm.DefaultSource$$anonfun$buildReader$1$$anonfun$8.apply(LibSVMRelation.scala:209) > at > org.apache.spark.ml.source.libsvm.DefaultSource$$anonfun$buildReader$1$$anonfun$8.apply(LibSVMRelation.scala:207) > at scala.collection.Iterator$$anon$11.next(Iterator.scala:409) > at scala.collection.Iterator$$anon$11.next(Iterator.scala:409) > at > org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.next(FileScanRDD.scala:90) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIterator.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegen$$anonfun$7$$anon$1.hasNext(WholeStageCodegen.scala:362) > at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408) > at > org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125) > at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:79) > at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:47) > at org.apache.spark.scheduler.Task.run(Task.scala:85) > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:254) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) > at java.lang.Thread.run(Thread.java:745) > Caused by: java.lang.ClassCastException: java.lang.Double cannot be cast to > org.apache.spark.mllib.linalg.Vector > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown > Source) > at > org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.toRow(ExpressionEncoder.scala:227) > ... 17 more > {code} > The error is triggered by L163 of {{LinearRegression}}: > {code} > val numFeatures = dataset.select(col($(featuresCol))).limit(1).rdd.map { > case Row(features: Vector) => features.size > }.first() > {code} > Using the above example, the following works: > {code} > scala> data.select("label").rdd.map { case Row(d: Double) => d }.first > res49: Double = -9.490009878824548 > {code} > But this triggers the exception: > {code} > scala> data.select("features").rdd.map { case Row(d: Vector) => d }.first > 16/04/22 11:25:20 ERROR Executor: Exception in task 0.0 in stage 87.0 (TID 98) > java.lang.RuntimeException: Error while encoding: > java.lang.ClassCastException: java.lang.Double cannot be cast to > org.apache.spark.mllib.linalg.Vector > ... > {code} -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org