[ https://issues.apache.org/jira/browse/SPARK-25723?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
huanghuai updated SPARK-25723: ------------------------------ Environment: local mode Description: {color:#333333}*spark.read()*{color} {color:#333333}*.format("com.myself.datasource")*{color} {color:#333333}*.option("ur","xxxx")*{color} {color:#333333}*.load()*{color} {color:#FF0000}*.show()*{color} {color:#FF0000}*Driver stacktrace:*{color} {color:#FF0000} *at*{color} org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586) ~[spark-core_2.11-2.3.0.jar:2.3.0] at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) ~[scala-library-2.11.8.jar:?] at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) ~[scala-library-2.11.8.jar:?] at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831) ~[spark-core_2.11-2.3.0.jar:2.3.0] at scala.Option.foreach(Option.scala:257) ~[scala-library-2.11.8.jar:?] 
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3272) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3253) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3252) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.Dataset.head(Dataset.scala:2484) 
~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.Dataset.take(Dataset.scala:2698) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.Dataset.showString(Dataset.scala:254) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.Dataset.show(Dataset.scala:723) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.Dataset.show(Dataset.scala:682) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.Dataset.show(Dataset.scala:691) ~[spark-sql_2.11-2.3.0.jar:2.3.0] {color:#FF0000}*Caused by: scala.MatchError: 23.25 (of class java.lang.Double)*{color} {color:#FF0000} *at*{color} org.apache.spark.sql.catalyst.CatalystTypeConverters$StringConverter$.toCatalystImpl(CatalystTypeConverters.scala:276) ~[spark-catalyst_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.catalyst.CatalystTypeConverters$StringConverter$.toCatalystImpl(CatalystTypeConverters.scala:275) ~[spark-catalyst_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.catalyst.CatalystTypeConverters$CatalystTypeConverter.toCatalyst(CatalystTypeConverters.scala:103) ~[spark-catalyst_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.catalyst.CatalystTypeConverters$$anonfun$createToCatalystConverter$2.apply(CatalystTypeConverters.scala:379) ~[spark-catalyst_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.execution.RDDConversions$$anonfun$rowToRowRdd$1$$anonfun$apply$3.apply(ExistingRDD.scala:60) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.execution.RDDConversions$$anonfun$rowToRowRdd$1$$anonfun$apply$3.apply(ExistingRDD.scala:57) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at scala.collection.Iterator$$anon$11.next(Iterator.scala:409) ~[scala-library-2.11.8.jar:?] at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source) ~[?:?] 
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247) ~[spark-sql_2.11-2.3.0.jar:2.3.0] at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.scheduler.Task.run(Task.scala:109) ~[spark-core_2.11-2.3.0.jar:2.3.0] at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345) ~[spark-core_2.11-2.3.0.jar:2.3.0] ... 3 more --------------------------------------------------------------------------------------------------------------- If I use dataset.show(), this error will occur. If I use dataset.collectAsList(), it is OK. 
I debugged in the following method, {color:#FF0000}*org.apache.spark.sql.execution.RDDConversions.*{color} {color:#FF0000}*rowToRowRdd(data: RDD[Row], outputTypes: Seq[DataType])*{color} and found that every time I inspected {*outputTypes: Seq[DataType]*}*, it was different, like this:* *the first time:* DoubleType IntegerType StringType *maybe the second or third time:* DoubleType StringType IntegerType The order of the data types (schema) is wrong, so when it executes {mutableRow(i) = converters(i)(r(i))} it will get a wrong converter, and Scala's MatchError will occur. > dataset.show() , scala.MatchError: 23.25 (of class java.lang.Double) > -------------------------------------------------------------------- > > Key: SPARK-25723 > URL: https://issues.apache.org/jira/browse/SPARK-25723 > Project: Spark > Issue Type: Question > Components: SQL > Affects Versions: 2.3.2 > Environment: local mode > Reporter: huanghuai > Priority: Major > > {color:#333333}*spark.read()*{color} > {color:#333333}*.format("com.myself.datasource")*{color} > {color:#333333}*.option("ur","xxxx")*{color} > {color:#333333}*.load()*{color} > {color:#FF0000}*.show()*{color} > > {color:#FF0000}*Driver stacktrace:*{color} > {color:#FF0000} *at*{color} > org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at > scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) > ~[scala-library-2.11.8.jar:?] > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) > ~[scala-library-2.11.8.jar:?] 
> at > org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at scala.Option.foreach(Option.scala:257) ~[scala-library-2.11.8.jar:?] > at > org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3272) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at 
org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3253) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3252) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.sql.Dataset.head(Dataset.scala:2484) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.sql.Dataset.take(Dataset.scala:2698) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.sql.Dataset.showString(Dataset.scala:254) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.sql.Dataset.show(Dataset.scala:723) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.sql.Dataset.show(Dataset.scala:682) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.sql.Dataset.show(Dataset.scala:691) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > {color:#FF0000}*Caused by: scala.MatchError: 23.25 (of class > java.lang.Double)*{color} > {color:#FF0000} *at*{color} > org.apache.spark.sql.catalyst.CatalystTypeConverters$StringConverter$.toCatalystImpl(CatalystTypeConverters.scala:276) > ~[spark-catalyst_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.sql.catalyst.CatalystTypeConverters$StringConverter$.toCatalystImpl(CatalystTypeConverters.scala:275) > ~[spark-catalyst_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.sql.catalyst.CatalystTypeConverters$CatalystTypeConverter.toCatalyst(CatalystTypeConverters.scala:103) > ~[spark-catalyst_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.sql.catalyst.CatalystTypeConverters$$anonfun$createToCatalystConverter$2.apply(CatalystTypeConverters.scala:379) > ~[spark-catalyst_2.11-2.3.0.jar:2.3.0] > at > 
org.apache.spark.sql.execution.RDDConversions$$anonfun$rowToRowRdd$1$$anonfun$apply$3.apply(ExistingRDD.scala:60) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.sql.execution.RDDConversions$$anonfun$rowToRowRdd$1$$anonfun$apply$3.apply(ExistingRDD.scala:57) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at scala.collection.Iterator$$anon$11.next(Iterator.scala:409) > ~[scala-library-2.11.8.jar:?] > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown > Source) ~[?:?] > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:253) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.sql.execution.SparkPlan$$anonfun$2.apply(SparkPlan.scala:247) > ~[spark-sql_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at > org.apache.spark.rdd.RDD$$anonfun$mapPartitionsInternal$1$$anonfun$apply$25.apply(RDD.scala:830) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.rdd.RDD.iterator(RDD.scala:288) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at 
org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.scheduler.Task.run(Task.scala:109) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345) > ~[spark-core_2.11-2.3.0.jar:2.3.0] > ... 3 more > > --------------------------------------------------------------------------------------------------------------- > If I use dataset.show(), this error will occur. > If I use dataset.collectAsList(), it is OK. > I debugged in the following method, > {color:#FF0000}*org.apache.spark.sql.execution.RDDConversions.*{color} > {color:#FF0000}*rowToRowRdd(data: RDD[Row], outputTypes: > Seq[DataType])*{color} > and found that every time I inspected {*outputTypes: Seq[DataType]*}*, it was > different, like this:* > *the first time:* > DoubleType > IntegerType > StringType > *maybe the second or third time:* > DoubleType > StringType > IntegerType > > The order of the data types (schema) is wrong, so when it executes > {mutableRow(i) = converters(i)(r(i))} > it will get a wrong converter, and Scala's MatchError will occur. -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org