[ https://issues.apache.org/jira/browse/SPARK-40916?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Erik Krogen updated SPARK-40916: -------------------------------- Description: {code:sql} select t22.uid, from ( SELECT code, count(distinct uid) cnt FROM ( SELECT uid, code, lng, lat FROM ( select riskmanage_dw.GEOHASH_ENCODE(manhattan_dw.aes_decode(lng),manhattan_dw.aes_decode(lat),8) as code, uid, lng, lat, dt as event_time from ( select param['timestamp'] as dt, get_json_object(get_json_object(param['input'],'$.baseInfo'),'$.uid') uid, get_json_object(get_json_object(param['input'],'$.envInfo'),'$.lng') lng, get_json_object(get_json_object(param['input'],'$.envInfo'),'$.lat') lat from manhattan_ods.ods_log_manhattan_fbi_workflow_result_log and get_json_object(get_json_object(param['input'],'$.bizExtents'),'$.productId')='2001' )a and lng is not null and lat is not null ) t2 group by uid,code,lng,lat ) t1 GROUP BY code having count(DISTINCT uid)>=10 )t11 join ( SELECT uid, code, lng, lat FROM ( select riskmanage_dw.GEOHASH_ENCODE(manhattan_dw.aes_decode(lng),manhattan_dw.aes_decode(lat),8) as code, uid, lng, lat, dt as event_time from ( select param['timestamp'] as dt, get_json_object(get_json_object(param['input'],'$.baseInfo'),'$.uid') uid, get_json_object(get_json_object(param['input'],'$.envInfo'),'$.lng') lng, get_json_object(get_json_object(param['input'],'$.envInfo'),'$.lat') lat from manhattan_ods.ods_log_manhattan_fbi_workflow_result_log and get_json_object(get_json_object(param['input'],'$.bizExtents'),'$.productId')='2001' )a and lng is not null and lat is not null ) t2 where substr(code,0,6)<>'wx4ey3' group by uid,code,lng,lat ) t22 on t11.code=t22.code group by t22.uid {code} this sql can't run because `riskmanage_dw.GEOHASH_ENCODE(manhattan_dw.aes_decode(lng),manhattan_dw.aes_decode(lat),8)` will throw npe(`Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Unable to execute method public java.lang.String com.xiaoju.automarket.GeohashEncode.evaluate(java.lang.Double,java.lang.Double,java.lang.Integer) with arguments {null,null,8}:null`), 
but I have already filtered out null values in my condition; the UDF manhattan_dw.aes_decode returns null if lng or lat is null. *But after I remove the condition `where substr(code,0,6)<>'wx4ey3'`, it runs normally.* Complete stack trace: Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Unable to execute method public java.lang.String com.xiaoju.automarket.GeohashEncode.evaluate(java.lang.Double,java.lang.Double,java.lang.Integer) with arguments {null,null,8}:null at org.apache.hadoop.hive.ql.exec.FunctionRegistry.invoke(FunctionRegistry.java:1049) at org.apache.spark.sql.hive.HiveSimpleUDF.eval(hiveUDFs.scala:102) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificPredicate.subExpr_3$(Unknown Source) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificPredicate.eval(Unknown Source) at org.apache.spark.sql.execution.FilterExec.$anonfun$doExecute$3(basicPhysicalOperators.scala:275) at org.apache.spark.sql.execution.FilterExec.$anonfun$doExecute$3$adapted(basicPhysicalOperators.scala:274) at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:515) at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithKeys_0$(Unknown Source) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source) was: ``` select t22.uid, from ( SELECT code, count(distinct uid) cnt FROM ( SELECT uid, code, lng, lat FROM ( select riskmanage_dw.GEOHASH_ENCODE(manhattan_dw.aes_decode(lng),manhattan_dw.aes_decode(lat),8) as code, uid, lng, lat, dt as event_time from ( select param['timestamp'] as dt, get_json_object(get_json_object(param['input'],'$.baseInfo'),'$.uid') uid, get_json_object(get_json_object(param['input'],'$.envInfo'),'$.lng') lng, get_json_object(get_json_object(param['input'],'$.envInfo'),'$.lat') lat from manhattan_ods.ods_log_manhattan_fbi_workflow_result_log and 
get_json_object(get_json_object(param['input'],'$.bizExtents'),'$.productId')='2001' )a and lng is not null and lat is not null ) t2 group by uid,code,lng,lat ) t1 GROUP BY code having count(DISTINCT uid)>=10 )t11 join ( SELECT uid, code, lng, lat FROM ( select riskmanage_dw.GEOHASH_ENCODE(manhattan_dw.aes_decode(lng),manhattan_dw.aes_decode(lat),8) as code, uid, lng, lat, dt as event_time from ( select param['timestamp'] as dt, get_json_object(get_json_object(param['input'],'$.baseInfo'),'$.uid') uid, get_json_object(get_json_object(param['input'],'$.envInfo'),'$.lng') lng, get_json_object(get_json_object(param['input'],'$.envInfo'),'$.lat') lat from manhattan_ods.ods_log_manhattan_fbi_workflow_result_log and get_json_object(get_json_object(param['input'],'$.bizExtents'),'$.productId')='2001' )a and lng is not null and lat is not null ) t2 where substr(code,0,6)<>'wx4ey3' group by uid,code,lng,lat ) t22 on t11.code=t22.code group by t22.uid ``` this sql can't run because `riskmanage_dw.GEOHASH_ENCODE(manhattan_dw.aes_decode(lng),manhattan_dw.aes_decode(lat),8)` will throw npe(`Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Unable to execute method public java.lang.String com.xiaoju.automarket.GeohashEncode.evaluate(java.lang.Double,java.lang.Double,java.lang.Integer) with arguments {null,null,8}:null`), but I have filter null in my condition, the udf of manhattan_dw.aes_decode will return null if lng or lat is null, *but after I remove `where substr(code,0,6)<>'wx4ey3' `this condition, it can run normally.* complete : Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Unable to execute method public java.lang.String com.xiaoju.automarket.GeohashEncode.evaluate(java.lang.Double,java.lang.Double,java.lang.Integer) with arguments {null,null,8}:null at org.apache.hadoop.hive.ql.exec.FunctionRegistry.invoke(FunctionRegistry.java:1049) at org.apache.spark.sql.hive.HiveSimpleUDF.eval(hiveUDFs.scala:102) at 
org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificPredicate.subExpr_3$(Unknown Source) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificPredicate.eval(Unknown Source) at org.apache.spark.sql.execution.FilterExec.$anonfun$doExecute$3(basicPhysicalOperators.scala:275) at org.apache.spark.sql.execution.FilterExec.$anonfun$doExecute$3$adapted(basicPhysicalOperators.scala:274) at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:515) at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithKeys_0$(Unknown Source) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source) > udf could not filter null value cause npe > ----------------------------------------- > > Key: SPARK-40916 > URL: https://issues.apache.org/jira/browse/SPARK-40916 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 3.2.0 > Environment: spark3.2.0 > hadoop2.7.3 > hive2.3.9 > Reporter: jingxiong zhong > Priority: Critical > > {code:sql} > select > t22.uid, > from > ( > SELECT > code, > count(distinct uid) cnt > FROM > ( > SELECT > uid, > code, > lng, > lat > FROM > ( > select > > riskmanage_dw.GEOHASH_ENCODE(manhattan_dw.aes_decode(lng),manhattan_dw.aes_decode(lat),8) > as code, > uid, > lng, > lat, > dt as event_time > from > ( > select > param['timestamp'] as dt, > > get_json_object(get_json_object(param['input'],'$.baseInfo'),'$.uid') uid, > > get_json_object(get_json_object(param['input'],'$.envInfo'),'$.lng') lng, > > get_json_object(get_json_object(param['input'],'$.envInfo'),'$.lat') lat > from manhattan_ods.ods_log_manhattan_fbi_workflow_result_log > and > get_json_object(get_json_object(param['input'],'$.bizExtents'),'$.productId')='2001' > > )a > and lng is not null > and lat is not null > ) t2 > group by uid,code,lng,lat > ) t1 > GROUP BY code 
having count(DISTINCT uid)>=10 > )t11 > join > ( > SELECT > uid, > code, > lng, > lat > FROM > ( > select > > riskmanage_dw.GEOHASH_ENCODE(manhattan_dw.aes_decode(lng),manhattan_dw.aes_decode(lat),8) > as code, > uid, > lng, > lat, > dt as event_time > from > ( > select > param['timestamp'] as dt, > > get_json_object(get_json_object(param['input'],'$.baseInfo'),'$.uid') uid, > > get_json_object(get_json_object(param['input'],'$.envInfo'),'$.lng') lng, > > get_json_object(get_json_object(param['input'],'$.envInfo'),'$.lat') lat > from manhattan_ods.ods_log_manhattan_fbi_workflow_result_log > and > get_json_object(get_json_object(param['input'],'$.bizExtents'),'$.productId')='2001' > > )a > and lng is not null > and lat is not null > ) t2 > where substr(code,0,6)<>'wx4ey3' > group by uid,code,lng,lat > ) t22 on t11.code=t22.code > group by t22.uid > {code} > this sql can't run because > `riskmanage_dw.GEOHASH_ENCODE(manhattan_dw.aes_decode(lng),manhattan_dw.aes_decode(lat),8)` > will throw npe(`Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: > Unable to execute method public java.lang.String > com.xiaoju.automarket.GeohashEncode.evaluate(java.lang.Double,java.lang.Double,java.lang.Integer) > with arguments {null,null,8}:null`), but I have filter null in my condition, > the udf of manhattan_dw.aes_decode will return null if lng or lat is null, > *but after I remove `where substr(code,0,6)<>'wx4ey3' `this condition, it can > run normally.* > complete : > Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Unable to > execute method public java.lang.String > com.xiaoju.automarket.GeohashEncode.evaluate(java.lang.Double,java.lang.Double,java.lang.Integer) > with arguments {null,null,8}:null > at > org.apache.hadoop.hive.ql.exec.FunctionRegistry.invoke(FunctionRegistry.java:1049) > at org.apache.spark.sql.hive.HiveSimpleUDF.eval(hiveUDFs.scala:102) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificPredicate.subExpr_3$(Unknown > 
Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificPredicate.eval(Unknown > Source) > at > org.apache.spark.sql.execution.FilterExec.$anonfun$doExecute$3(basicPhysicalOperators.scala:275) > at > org.apache.spark.sql.execution.FilterExec.$anonfun$doExecute$3$adapted(basicPhysicalOperators.scala:274) > at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:515) > at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithKeys_0$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown > Source) -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org