[ 
https://issues.apache.org/jira/browse/SPARK-30349?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

cen yuhai updated SPARK-30349:
------------------------------
    Description: 
```sql

with tmp as(
 select
 log_date,
 buvid,
 manga_id,
 sum(readtime) readtime
 from
 manga.dwd_app_readtime_xt_dt
 where
 log_date >= 20191220
 group by
 log_date,
 buvid,
 manga_id
 )
 select
 t.log_date,
 GET_JSON_OBJECT(t.extended_fields, '$.type'),
 count(distinct t.buvid),
 count(distinct t0.buvid),
 count(distinct t1.buvid),
 count(distinct t2.buvid),
 count(
 distinct case
 when t1.buvid = t0.buvid then t1.buvid
 end
 ),
 count(
 distinct case
 when t1.buvid = t0.buvid
 and t1.buvid = t2.buvid then t1.buvid
 end
 ),
 count(
 distinct case
 when t0.buvid = t2.buvid then t0.buvid
 end
 ),
 sum(readtime),
 avg(readtime),
 sum(
 case
 when t0.buvid = t3.buvid then readtime
 end
 ),
 avg(
 case
 when t0.buvid = t3.buvid then readtime
 end
 )
 from
 manga.manga_tfc_app_ubt_d t
 join manga.manga_tfc_app_ubt_d t1 on t.buvid = t1.buvid
 and t1.log_date >= 20191220
 and t1.event_id = 'bilibili-manga.manga-detail.0.0.pv'
 and to_date(t.stime) = TO_DATE(t1.stime)
 and GET_JSON_OBJECT(t1.extended_fields, '$.manga_id') = 
GET_JSON_OBJECT(t.extended_fields, '$.manga_id')
 left join manga.manga_buvid_minlog t0 on t.buvid = t0.buvid
 and t0.log_date = 20191223
 and t0.minlog >= '2019-12-20'
 and to_date(t.stime) = TO_DATE(t0.minlog)
 left join manga.dwb_tfc_app_launch_df t2 on t.buvid = t2.buvid
 and t2.log_date >= 20191220
 and DATE_ADD(to_date(t.stime), 1) = to_date(t2.stime)
 left join tmp t3 on t1.buvid = t3.buvid
 and t3.log_date >= 20191220
 and t3.manga_id = GET_JSON_OBJECT(t.extended_fields, '$.manga_id')
 where
 t.log_date >= 20191220
 and t.event_id = 'manga.homepage-recommend.detail.0.click'
 group by
 t.log_date,
 GET_JSON_OBJECT(t.extended_fields, '$.type')

```

  was:
```sql

 

 

 

```sql

with tmp as(
 select
 log_date,
 buvid,
 manga_id,
 sum(readtime) readtime
 from
 manga.dwd_app_readtime_xt_dt
 where
 log_date >= 20191220
 group by
 log_date,
 buvid,
 manga_id
 )
 select
 t.log_date,
 GET_JSON_OBJECT(t.extended_fields, '$.type'),
 count(distinct t.buvid),
 count(distinct t0.buvid),
 count(distinct t1.buvid),
 count(distinct t2.buvid),
 count(
 distinct case
 when t1.buvid = t0.buvid then t1.buvid
 end
 ),
 count(
 distinct case
 when t1.buvid = t0.buvid
 and t1.buvid = t2.buvid then t1.buvid
 end
 ),
 count(
 distinct case
 when t0.buvid = t2.buvid then t0.buvid
 end
 ),
 sum(readtime),
 avg(readtime),
 sum(
 case
 when t0.buvid = t3.buvid then readtime
 end
 ),
 avg(
 case
 when t0.buvid = t3.buvid then readtime
 end
 )
 from
 manga.manga_tfc_app_ubt_d t
 join manga.manga_tfc_app_ubt_d t1 on t.buvid = t1.buvid
 and t1.log_date >= 20191220
 and t1.event_id = 'bilibili-manga.manga-detail.0.0.pv'
 and to_date(t.stime) = TO_DATE(t1.stime)
 and GET_JSON_OBJECT(t1.extended_fields, '$.manga_id') = 
GET_JSON_OBJECT(t.extended_fields, '$.manga_id')
 left join manga.manga_buvid_minlog t0 on t.buvid = t0.buvid
 and t0.log_date = 20191223
 and t0.minlog >= '2019-12-20'
 and to_date(t.stime) = TO_DATE(t0.minlog)
 left join manga.dwb_tfc_app_launch_df t2 on t.buvid = t2.buvid
 and t2.log_date >= 20191220
 and DATE_ADD(to_date(t.stime), 1) = to_date(t2.stime)
 left join tmp t3 on t1.buvid = t3.buvid
 and t3.log_date >= 20191220
 and t3.manga_id = GET_JSON_OBJECT(t.extended_fields, '$.manga_id')
 where
 t.log_date >= 20191220
 and t.event_id = 'manga.homepage-recommend.detail.0.click'
 group by
 t.log_date,
 GET_JSON_OBJECT(t.extended_fields, '$.type')

```


> The result is wrong when joining tables with selecting the same columns
> -----------------------------------------------------------------------
>
>                 Key: SPARK-30349
>                 URL: https://issues.apache.org/jira/browse/SPARK-30349
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.4.4
>         Environment: hadoop-2.8.4 spark 2.4.4
>            Reporter: cen yuhai
>            Priority: Major
>
> ```sql
> with tmp as(
>  select
>  log_date,
>  buvid,
>  manga_id,
>  sum(readtime) readtime
>  from
>  manga.dwd_app_readtime_xt_dt
>  where
>  log_date >= 20191220
>  group by
>  log_date,
>  buvid,
>  manga_id
>  )
>  select
>  t.log_date,
>  GET_JSON_OBJECT(t.extended_fields, '$.type'),
>  count(distinct t.buvid),
>  count(distinct t0.buvid),
>  count(distinct t1.buvid),
>  count(distinct t2.buvid),
>  count(
>  distinct case
>  when t1.buvid = t0.buvid then t1.buvid
>  end
>  ),
>  count(
>  distinct case
>  when t1.buvid = t0.buvid
>  and t1.buvid = t2.buvid then t1.buvid
>  end
>  ),
>  count(
>  distinct case
>  when t0.buvid = t2.buvid then t0.buvid
>  end
>  ),
>  sum(readtime),
>  avg(readtime),
>  sum(
>  case
>  when t0.buvid = t3.buvid then readtime
>  end
>  ),
>  avg(
>  case
>  when t0.buvid = t3.buvid then readtime
>  end
>  )
>  from
>  manga.manga_tfc_app_ubt_d t
>  join manga.manga_tfc_app_ubt_d t1 on t.buvid = t1.buvid
>  and t1.log_date >= 20191220
>  and t1.event_id = 'bilibili-manga.manga-detail.0.0.pv'
>  and to_date(t.stime) = TO_DATE(t1.stime)
>  and GET_JSON_OBJECT(t1.extended_fields, '$.manga_id') = 
> GET_JSON_OBJECT(t.extended_fields, '$.manga_id')
>  left join manga.manga_buvid_minlog t0 on t.buvid = t0.buvid
>  and t0.log_date = 20191223
>  and t0.minlog >= '2019-12-20'
>  and to_date(t.stime) = TO_DATE(t0.minlog)
>  left join manga.dwb_tfc_app_launch_df t2 on t.buvid = t2.buvid
>  and t2.log_date >= 20191220
>  and DATE_ADD(to_date(t.stime), 1) = to_date(t2.stime)
>  left join tmp t3 on t1.buvid = t3.buvid
>  and t3.log_date >= 20191220
>  and t3.manga_id = GET_JSON_OBJECT(t.extended_fields, '$.manga_id')
>  where
>  t.log_date >= 20191220
>  and t.event_id = 'manga.homepage-recommend.detail.0.click'
>  group by
>  t.log_date,
>  GET_JSON_OBJECT(t.extended_fields, '$.type')
> ```



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to