[ https://issues.apache.org/jira/browse/SPARK-30349?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
cen yuhai updated SPARK-30349: ------------------------------ Description: ```sql with tmp as( select log_date, buvid, manga_id, sum(readtime) readtime from manga.dwd_app_readtime_xt_dt where log_date >= 20191220 group by log_date, buvid, manga_id ) select t.log_date, GET_JSON_OBJECT(t.extended_fields, '$.type'), count(distinct t.buvid), count(distinct t0.buvid), count(distinct t1.buvid), count(distinct t2.buvid), count( distinct case when t1.buvid = t0.buvid then t1.buvid end ), count( distinct case when t1.buvid = t0.buvid and t1.buvid = t2.buvid then t1.buvid end ), count( distinct case when t0.buvid = t2.buvid then t0.buvid end ), sum(readtime), avg(readtime), sum( case when t0.buvid = t3.buvid then readtime end ), avg( case when t0.buvid = t3.buvid then readtime end ) from manga.manga_tfc_app_ubt_d t join manga.manga_tfc_app_ubt_d t1 on t.buvid = t1.buvid and t1.log_date >= 20191220 and t1.event_id = 'bilibili-manga.manga-detail.0.0.pv' and to_date(t.stime) = TO_DATE(t1.stime) and GET_JSON_OBJECT(t1.extended_fields, '$.manga_id') = GET_JSON_OBJECT(t.extended_fields, '$.manga_id') left join manga.manga_buvid_minlog t0 on t.buvid = t0.buvid and t0.log_date = 20191223 and t0.minlog >= '2019-12-20' and to_date(t.stime) = TO_DATE(t0.minlog) left join manga.dwb_tfc_app_launch_df t2 on t.buvid = t2.buvid and t2.log_date >= 20191220 and DATE_ADD(to_date(t.stime), 1) = to_date(t2.stime) left join tmp t3 on t1.buvid = t3.buvid and t3.log_date >= 20191220 and t3.manga_id = GET_JSON_OBJECT(t.extended_fields, '$.manga_id') where t.log_date >= 20191220 and t.event_id = 'manga.homepage-recommend.detail.0.click' group by t.log_date, GET_JSON_OBJECT(t.extended_fields, '$.type') ``` ``` was: ```sql ```sql with tmp as( select log_date, buvid, manga_id, sum(readtime) readtime from manga.dwd_app_readtime_xt_dt where log_date >= 20191220 group by log_date, buvid, manga_id ) select t.log_date, GET_JSON_OBJECT(t.extended_fields, '$.type'), count(distinct t.buvid), count(distinct t0.buvid), count(distinct t1.buvid), count(distinct t2.buvid), count( distinct case when t1.buvid = t0.buvid then t1.buvid end ), count( distinct case when t1.buvid = t0.buvid and t1.buvid = t2.buvid then t1.buvid end ), count( distinct case when t0.buvid = t2.buvid then t0.buvid end ), sum(readtime), avg(readtime), sum( case when t0.buvid = t3.buvid then readtime end ), avg( case when t0.buvid = t3.buvid then readtime end ) from manga.manga_tfc_app_ubt_d t join manga.manga_tfc_app_ubt_d t1 on t.buvid = t1.buvid and t1.log_date >= 20191220 and t1.event_id = 'bilibili-manga.manga-detail.0.0.pv' and to_date(t.stime) = TO_DATE(t1.stime) and GET_JSON_OBJECT(t1.extended_fields, '$.manga_id') = GET_JSON_OBJECT(t.extended_fields, '$.manga_id') left join manga.manga_buvid_minlog t0 on t.buvid = t0.buvid and t0.log_date = 20191223 and t0.minlog >= '2019-12-20' and to_date(t.stime) = TO_DATE(t0.minlog) left join manga.dwb_tfc_app_launch_df t2 on t.buvid = t2.buvid and t2.log_date >= 20191220 and DATE_ADD(to_date(t.stime), 1) = to_date(t2.stime) left join tmp t3 on t1.buvid = t3.buvid and t3.log_date >= 20191220 and t3.manga_id = GET_JSON_OBJECT(t.extended_fields, '$.manga_id') where t.log_date >= 20191220 and t.event_id = 'manga.homepage-recommend.detail.0.click' group by t.log_date, GET_JSON_OBJECT(t.extended_fields, '$.type') ``` ``` > The result is wrong when joining tables with selecting the same columns > ----------------------------------------------------------------------- > > Key: SPARK-30349 > URL: https://issues.apache.org/jira/browse/SPARK-30349 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 2.4.4 > Environment: hadoop-2.8.4 spark 2.4.4 > Reporter: cen yuhai > Priority: Major > > ```sql > with tmp as( > select > log_date, > buvid, > manga_id, > sum(readtime) readtime > from > manga.dwd_app_readtime_xt_dt > where > log_date >= 20191220 > group by > log_date, > buvid, > manga_id > ) > select > t.log_date, > GET_JSON_OBJECT(t.extended_fields, '$.type'), > count(distinct t.buvid), > count(distinct t0.buvid), > count(distinct t1.buvid), > count(distinct t2.buvid), > count( > distinct case > when t1.buvid = t0.buvid then t1.buvid > end > ), > count( > distinct case > when t1.buvid = t0.buvid > and t1.buvid = t2.buvid then t1.buvid > end > ), > count( > distinct case > when t0.buvid = t2.buvid then t0.buvid > end > ), > sum(readtime), > avg(readtime), > sum( > case > when t0.buvid = t3.buvid then readtime > end > ), > avg( > case > when t0.buvid = t3.buvid then readtime > end > ) > from > manga.manga_tfc_app_ubt_d t > join manga.manga_tfc_app_ubt_d t1 on t.buvid = t1.buvid > and t1.log_date >= 20191220 > and t1.event_id = 'bilibili-manga.manga-detail.0.0.pv' > and to_date(t.stime) = TO_DATE(t1.stime) > and GET_JSON_OBJECT(t1.extended_fields, '$.manga_id') = > GET_JSON_OBJECT(t.extended_fields, '$.manga_id') > left join manga.manga_buvid_minlog t0 on t.buvid = t0.buvid > and t0.log_date = 20191223 > and t0.minlog >= '2019-12-20' > and to_date(t.stime) = TO_DATE(t0.minlog) > left join manga.dwb_tfc_app_launch_df t2 on t.buvid = t2.buvid > and t2.log_date >= 20191220 > and DATE_ADD(to_date(t.stime), 1) = to_date(t2.stime) > left join tmp t3 on t1.buvid = t3.buvid > and t3.log_date >= 20191220 > and t3.manga_id = GET_JSON_OBJECT(t.extended_fields, '$.manga_id') > where > t.log_date >= 20191220 > and t.event_id = 'manga.homepage-recommend.detail.0.click' > group by > t.log_date, > GET_JSON_OBJECT(t.extended_fields, '$.type') > ``` > ``` -- This message was sent by Atlassian Jira (v8.3.4#803005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org