[ 
https://issues.apache.org/jira/browse/SPARK-35688?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

todd.chen updated SPARK-35688:
------------------------------
    Docs Text: 
code: 

```scala
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

// Placeholder conf: the original report does not show how sparkConf is built.
val sparkConf = new SparkConf()

val sparkSession = SparkSession
  .builder()
  .config(sparkConf)
  .getOrCreate()

import sparkSession.implicits._

// Write a small test dataset; the "?" row is deliberately malformed.
List(
  ("[b,c,d]", "{\"uid\": \"2\"}"),
  ("[a,b,d]", "{}"),
  ("?", "{}"),
  ("[d,b,c]", "{\"uid\": \"2\"}")
).toDF("name", "json").write.mode("overwrite").parquet("/tmp/tb_eliminate_bad_case_data")

sparkSession.read
  .parquet("/tmp/tb_eliminate_bad_case_data")
  .createOrReplaceTempView("tb_data")

// UDF that throws on any input not wrapped in brackets, e.g. "?".
val name2Array: String => Array[String] = { input =>
  if (input.startsWith("[") && input.endsWith("]")) {
    input.substring(1, input.length - 1).split(",")
  } else {
    throw new IllegalArgumentException(s"invalid input $input")
  }
}

sparkSession.udf.register("explode_name_array", name2Array)

// The inner WHERE clause filters out the "?" row, so the UDF should
// never see it -- yet the query fails.
sparkSession
  .sql("""
    |select uid, name_array from (
    |  select
    |    explode(explode_name_array(name)) as name_array,
    |    get_json_object(json, '$.uid') as uid
    |  from tb_data
    |  where
    |    name is not null and name <> '?'
    |)
    |where uid > 0
    |""".stripMargin)
  .show()
```
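
Although the inner WHERE clause (`name is not null and name <> '?'`) should remove the malformed `"?"` row before `explode_name_array` is ever applied, the query fails with the exception below.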



errorMsg: 

```
06-09 13:28:11 175  WARN (org.apache.spark.scheduler.TaskSetManager:69) - Lost task 1.0 in stage 3.0 (TID 6) (192.168.112.64 executor driver):
org.apache.spark.SparkException: Failed to execute user defined function(NewDebug$$$Lambda$2695/852243673: (string) => array<string>)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificPredicate.subExpr_0$(generated.java:155)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificPredicate.eval(generated.java:22)
	at org.apache.spark.sql.execution.FilterExec.$anonfun$doExecute$3(basicPhysicalOperators.scala:249)
	at org.apache.spark.sql.execution.FilterExec.$anonfun$doExecute$3$adapted(basicPhysicalOperators.scala:248)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:509)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:485)
	at scala.collection.Iterator$ConcatIterator.hasNext(Iterator.scala:218)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:454)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:454)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.IllegalArgumentException: invalid input ?
```
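
The `subExpr_0$` frame in the generated `SpecificPredicate` indicates that subexpression elimination evaluates the `explode_name_array(name)` call eagerly, before the short-circuiting conditions `name is not null and name <> '?'` can skip the malformed row, so the UDF receives `?` and throws.

A minimal workaround sketch until this is fixed, assuming callers can tolerate a `null` result for malformed input (the `name2ArraySafe` name is hypothetical):

```scala
// Hypothetical defensive variant of the UDF: return null for malformed
// input instead of throwing, so eager evaluation of the eliminated
// subexpression cannot fail the task. Malformed rows such as "?" are
// still removed by the WHERE clause.
val name2ArraySafe: String => Array[String] = { input =>
  if (input != null && input.startsWith("[") && input.endsWith("]")) {
    input.substring(1, input.length - 1).split(",")
  } else {
    null
  }
}

sparkSession.udf.register("explode_name_array", name2ArraySafe)
```

Disabling subexpression elimination (`spark.sql.subexpressionElimination.enabled=false`, an internal conf) should also avoid the eager UDF call and can be used to confirm the diagnosis.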

> GeneratePredicate subexpression elimination can fail in some cases
> ------------------------------------------------------------------
>
>                 Key: SPARK-35688
>                 URL: https://issues.apache.org/jira/browse/SPARK-35688
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 3.1.1
>            Reporter: todd.chen
>            Priority: Major
>




--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org
