[ 
https://issues.apache.org/jira/browse/HIVE-503?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12712641#action_12712641
 ] 

Min Zhou commented on HIVE-503:
-------------------------------

consider about a real world example

Approach 1: 
{code:sql}
set hive.groupby.skewindata=false;

create table rslt_tbg_pvuv (tbg STRING, memeber_pv int, guest_pv int, 
memeber_uv int, guest_uv int);
INSERT OVERWRITE TABLE rslt_tbg_pvuv 
SELECT
  tbg,
  cast(sum(CASE WHEN is_member=0 THEN 1 ELSE 0 END) AS int) AS member_pv,
  cast(sum(CASE WHEN is_member=1 THEN 1 ELSE 0 END) AS int) AS guest_pv,
  count(distinct CASE WHEN is_member=0 THEN id END) AS member_uv,
  count(distinct CASE WHEN is_member=1 THEN id END) AS guest_uv
FROM 
  (SELECT
    regexp_extract(url, 'http://.*TBG=([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+).*', 1) 
as tbg,
    if(uid <> '-', 0, 1) as is_member,
    if(uid <> '-', uid, mid) as id
  FROM
    web_log
  WHERE
    (  url rlike 'http://.*TBG=1.3.1.1'
    OR url rlike 'http://.*TBG=1.3.2.2'
    OR url rlike 'http://.*TBG=1.3.3.3'
    OR url rlike 'http://.*TBG=1.3.4.4'
    OR url rlike 'http://.*TBG=1.3.5.5'
    OR url rlike 'http://.*TBG=1.3.6.6'
    OR url rlike 'http://.*TBG=1.3.7.7' )
    AND refer like '/1.gif?%'
    AND logdate='20090510')
GROUP BY tbg;
{code}

Approach 2:
{code:sql}
set hive.groupby.skewindata=false;

create table tmp_tbg_pvuv_1 (tbg STRING, is_member tinyint, id STRING);
INSERT OVERWRITE TABLE tmp_tbg_pvuv_1
SELECT
  regexp_extract(url, 'http://.*TBG=([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+).*', 1) as 
tbg,
  if(uid <> '-', 0, 1) as is_member,
  if(uid <> '-', uid, mid) as id
FROM
  web_log
WHERE
  (  url rlike 'http://.*TBG=1.3.1.1'
  OR url rlike 'http://.*TBG=1.3.2.2'
  OR url rlike 'http://.*TBG=1.3.3.3'
  OR url rlike 'http://.*TBG=1.3.4.4'
  OR url rlike 'http://.*TBG=1.3.5.5'
  OR url rlike 'http://.*TBG=1.3.6.6'
  OR url rlike 'http://.*TBG=1.3.7.7' )
 AND refer like '/1.gif?%'
 AND logdate='20090510';

create table tmp_tbg_pvuv_2 (tbg STRING, member_pv int, guest_pv int);
INSERT OVERWRITE TABLE tmp_tbg_pvuv_2
SELECT
  tbg,
  cast(sum(CASE WHEN is_member=0 THEN 1 ELSE 0 END) AS int) AS member_pv,
  cast(sum(CASE WHEN is_member=1 THEN 1 ELSE 0 END) AS int) AS guest_pv
FROM 
 tmp_tbg_pvuv_1
GROUP BY tbg

create table tmp_tbg_pvuv_3 (tbg STRING, member_uv int);
INSERT OVERWRITE TABLE tmp_tbg_pvuv_3
SELECT
  tbg,
  count(distinct CASE WHEN is_member=0 THEN id ELSE 'NULL' END) - 1 AS member_uv
FROM 
 tmp_tbg_pvuv_1
GROUP BY tbg

create table tmp_tbg_pvuv_4 (tbg STRING, guest_uv int);
INSERT OVERWRITE TABLE tmp_tbg_pvuv_4
SELECT
  tbg,
  count(distinct CASE WHEN is_member=1 THEN id ELSE 'NULL' END) - 1 AS guest_uv
FROM 
 tmp_tbg_pvuv_1
GROUP BY tbg

create table rslt_tbg_pvuv (tbg STRING, memeber_pv int, guest_pv int, 
memeber_uv int, guest_uv int);
INSERT OVERWRITE TABLE rslt_tbg_pvuv
SELECT
  t2.tbg,
  member_pv,
  guest_pv,
  member_uv,
  guest_uv
FROM
    tmp_tbg_pvuv_2 t2 
    join tmp_tbg_pvuv_3 t3 on t2.tbg=t3.tbg
    join tmp_tbg_pvuv_4 t4 on t2.tbg=t4.tbg
{code}

Do you insist the second approach is faster than the first one?


> improvement on distinct: distinguish distinct aggregate function from distinct
> ------------------------------------------------------------------------------
>
>                 Key: HIVE-503
>                 URL: https://issues.apache.org/jira/browse/HIVE-503
>             Project: Hadoop Hive
>          Issue Type: Improvement
>            Reporter: Min Zhou
>
> h4.distinct
> # OK
> {code:sql}
> select 
>    distinct col
> from 
>   tbl
> {code}
> # FAILED
> {code:sql}
> select 
>    distinct  col1,
>    distinct  col2
> from 
>   tbl
> {code}
> h4.distinct aggregate function
> # OK
> {code:sql}
> select 
>    count(distinct col % 10)
> from 
>   tbl
> {code}
> # OK
> {code:sql}
> select 
>    count(distinct col1% 10)
>    count(distinct col1% 9)
> from 
>   tbl
> {code}
> # OK
> {code:sql}
> select 
>    count(distinct col1 % 10)
>    count(distinct col2 % 9)
> from 
>   tbl
> {code}
> # OK
> {code:sql}
> select 
>   sum(distinct col1 % 10),
>   count(distinct col2 % 9)
> from 
>   tbl
> {code}
> # OK
> {code:sql}
> select 
>   max(distinct substr(col1, 1, 10)),
>   count(distinct col2 % 9)
> from 
>   tbl
> {code}
> The keyword "distinct" ofen produce more than one results, so it's impossible 
> removing two different columns' duplicates in only one mapreduce job, so it 
> failed.
> But the term "distinct aggregate function" with a form like 
> aggregate_function(distinct ....),  is in connection with the term "all 
> aggregate function",  it essentially is an aggregate function. Only one 
> result each aggregate function will produce,  it's very possible one 
> mapreduce job could deal with two or more different aggregate expression 
> simultaneously.

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.

Reply via email to