[ https://issues.apache.org/jira/browse/SPARK-35688?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
todd.chen updated SPARK-35688:
------------------------------

    Docs Text:

code:

```scala
val sparkSession = SparkSession
  .builder()
  .config(sparkConf)
  .getOrCreate()
import sparkSession.implicits._

List(
  ("[b,c,d]", "{\"uid\": \"2\"}"),
  ("[a,b,d]", "{}"),
  ("?", "{}"),
  ("[d,b,c]", "{\"uid\": \"2\"}")
).toDF("name", "json").write.mode("overwrite").parquet("/tmp/tb_eliminate_bad_case_data")

sparkSession.read
  .parquet("/tmp/tb_eliminate_bad_case_data")
  .createOrReplaceTempView("tb_data")

val name2Array: String => Array[String] = { input =>
  if (input.startsWith("[") && input.endsWith("]")) {
    input.substring(1, input.length - 1).split(",")
  } else {
    throw new IllegalArgumentException(s"invalid input $input")
  }
}
sparkSession.udf.register("explode_name_array", name2Array)

sparkSession
  .sql("""
    |select uid, name_array from (
    |  select
    |    explode(explode_name_array(name)) as name_array,
    |    get_json_object(json, '$.uid') as uid
    |  from tb_data
    |  where name is not null and name <> '?'
    |)
    |where uid > 0
    |""".stripMargin)
  .show()
```

errorMsg:

```
06-09 13:28:11 175 WARN (org.apache.spark.scheduler.TaskSetManager:69) - Lost task 1.0 in stage 3.0 (TID 6) (192.168.112.64 executor driver): org.apache.spark.SparkException: Failed to execute user defined function(NewDebug$$$Lambda$2695/852243673: (string) => array<string>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificPredicate.subExpr_0$(generated.java:155)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificPredicate.eval(generated.java:22)
	at org.apache.spark.sql.execution.FilterExec.$anonfun$doExecute$3(basicPhysicalOperators.scala:249)
	at org.apache.spark.sql.execution.FilterExec.$anonfun$doExecute$3$adapted(basicPhysicalOperators.scala:248)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:509)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:485)
	at scala.collection.Iterator$ConcatIterator.hasNext(Iterator.scala:218)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:454)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:454)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.IllegalArgumentException: invalid input ?
```
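The stack trace shows the UDF being invoked from `GeneratedClass$SpecificPredicate.subExpr_0$`, i.e. from inside a combined, code-generated filter predicate with subexpression elimination, rather than only after the inner `name <> '?'` guard has passed. Until the ordering issue itself is resolved, one possible workaround is to make the UDF total instead of throwing. A minimal sketch (`name2ArraySafe` and `explode_name_array_safe` are hypothetical names, not part of the report):

```scala
// Defensive variant of the UDF: return null for malformed input instead
// of throwing, so correctness no longer depends on the '?' filter being
// evaluated before the UDF.
val name2ArraySafe: String => Array[String] = { input =>
  if (input != null && input.startsWith("[") && input.endsWith("]")) {
    input.substring(1, input.length - 1).split(",")
  } else {
    null // tolerate bad rows such as "?" instead of throwing
  }
}
sparkSession.udf.register("explode_name_array_safe", name2ArraySafe)
```

Since `explode` produces no rows for a null array, the `?` row simply drops out of the result even if the generated predicate touches it before the filters run.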
was: (the previous description was identical, except that the exception message read "valid input $input" instead of "invalid input $input")
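To check whether subexpression elimination in the generated predicate is actually the trigger, a diagnostic sketch follows. `spark.sql.subexpressionElimination.enabled` is an existing Spark SQL conf, but whether disabling it avoids this particular failure is an assumption to verify, not a confirmed fix:

```scala
// Assumption to verify: if subExpr_0$ in the generated predicate is what
// calls the UDF early, disabling subexpression elimination may change the
// failure mode. Diagnostic only, not a fix.
sparkSession.conf.set("spark.sql.subexpressionElimination.enabled", "false")

val df = sparkSession.sql(
  """
    |select uid, name_array from (
    |  select
    |    explode(explode_name_array(name)) as name_array,
    |    get_json_object(json, '$.uid') as uid
    |  from tb_data
    |  where name is not null and name <> '?'
    |)
    |where uid > 0
    |""".stripMargin)

df.explain(true) // inspect where the UDF sits relative to the name <> '?' filter
df.show()
```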
> GeneratePredicate eliminate will fail in some case
> ---------------------------------------------------
>
>                 Key: SPARK-35688
>                 URL: https://issues.apache.org/jira/browse/SPARK-35688
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 3.1.1
>            Reporter: todd.chen
>            Priority: Major
>

--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org