[ https://issues.apache.org/jira/browse/HIVE-503?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12712641#action_12712641 ]
Min Zhou commented on HIVE-503: ------------------------------- consider about a real world example Approach 1: {code:sql} set hive.groupby.skewindata=false; create table rslt_tbg_pvuv (tbg STRING, memeber_pv int, guest_pv int, memeber_uv int, guest_uv int); INSERT OVERWRITE TABLE rslt_tbg_pvuv SELECT tbg, cast(sum(CASE WHEN is_member=0 THEN 1 ELSE 0 END) AS int) AS member_pv, cast(sum(CASE WHEN is_member=1 THEN 1 ELSE 0 END) AS int) AS guest_pv, count(distinct CASE WHEN is_member=0 THEN id END) AS member_uv, count(distinct CASE WHEN is_member=1 THEN id END) AS guest_uv FROM (SELECT regexp_extract(url, 'http://.*TBG=([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+).*', 1) as tbg, if(uid <> '-', 0, 1) as is_member, if(uid <> '-', uid, mid) as id FROM web_log WHERE ( url rlike 'http://.*TBG=1.3.1.1' OR url rlike 'http://.*TBG=1.3.2.2' OR url rlike 'http://.*TBG=1.3.3.3' OR url rlike 'http://.*TBG=1.3.4.4' OR url rlike 'http://.*TBG=1.3.5.5' OR url rlike 'http://.*TBG=1.3.6.6' OR url rlike 'http://.*TBG=1.3.7.7' ) AND refer like '/1.gif?%' AND logdate='20090510') GROUP BY tbg; {code} Approach 2: {code:sql} set hive.groupby.skewindata=false; create table tmp_tbg_pvuv_1 (tbg STRING, is_member tinyint, id STRING); INSERT OVERWRITE TABLE tmp_tbg_pvuv_1 SELECT regexp_extract(url, 'http://.*TBG=([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+).*', 1) as tbg, if(uid <> '-', 0, 1) as is_member, if(uid <> '-', uid, mid) as id FROM web_log WHERE ( url rlike 'http://.*TBG=1.3.1.1' OR url rlike 'http://.*TBG=1.3.2.2' OR url rlike 'http://.*TBG=1.3.3.3' OR url rlike 'http://.*TBG=1.3.4.4' OR url rlike 'http://.*TBG=1.3.5.5' OR url rlike 'http://.*TBG=1.3.6.6' OR url rlike 'http://.*TBG=1.3.7.7' ) AND refer like '/1.gif?%' AND logdate='20090510'; create table tmp_tbg_pvuv_2 (tbg STRING, member_pv int, guest_pv int); INSERT OVERWRITE TABLE tmp_tbg_pvuv_2 SELECT tbg, cast(sum(CASE WHEN is_member=0 THEN 1 ELSE 0 END) AS int) AS member_pv, cast(sum(CASE WHEN is_member=1 THEN 1 ELSE 0 END) AS int) AS guest_pv FROM tmp_tbg_pvuv_1 GROUP BY tbg create table tmp_tbg_pvuv_3 (tbg STRING, member_uv int); INSERT OVERWRITE TABLE tmp_tbg_pvuv_3 SELECT tbg, count(distinct CASE WHEN is_member=0 THEN id ELSE 'NULL' END) - 1 AS member_uv FROM tmp_tbg_pvuv_1 GROUP BY tbg create table tmp_tbg_pvuv_4 (tbg STRING, guest_uv int); INSERT OVERWRITE TABLE tmp_tbg_pvuv_4 SELECT tbg, count(distinct CASE WHEN is_member=1 THEN id ELSE 'NULL' END) - 1 AS guest_uv FROM tmp_tbg_pvuv_1 GROUP BY tbg create table rslt_tbg_pvuv (tbg STRING, memeber_pv int, guest_pv int, memeber_uv int, guest_uv int); INSERT OVERWRITE TABLE rslt_tbg_pvuv SELECT t2.tbg, member_pv, guest_pv, member_uv, guest_uv FROM tmp_tbg_pvuv_2 t2 join tmp_tbg_pvuv_3 t3 on t2.tbg=t3.tbg join tmp_tbg_pvuv_4 t4 on t2.tbg=t4.tbg {code} Do you insist the second approach is faster than the first one? > improvement on distinct: distinguish distinct aggregate function from distinct > ------------------------------------------------------------------------------ > > Key: HIVE-503 > URL: https://issues.apache.org/jira/browse/HIVE-503 > Project: Hadoop Hive > Issue Type: Improvement > Reporter: Min Zhou > > h4.distinct > # OK > {code:sql} > select > distinct col > from > tbl > {code} > # FAILED > {code:sql} > select > distinct col1, > distinct col2 > from > tbl > {code} > h4.distinct aggregate function > # OK > {code:sql} > select > count(distinct col % 10) > from > tbl > {code} > # OK > {code:sql} > select > count(distinct col1% 10) > count(distinct col1% 9) > from > tbl > {code} > # OK > {code:sql} > select > count(distinct col1 % 10) > count(distinct col2 % 9) > from > tbl > {code} > # OK > {code:sql} > select > sum(distinct col1 % 10), > count(distinct col2 % 9) > from > tbl > {code} > # OK > {code:sql} > select > max(distinct substr(col1, 1, 10)), > count(distinct col2 % 9) > from > tbl > {code} > The keyword "distinct" ofen produce more than one results, so it's impossible > removing two different columns' duplicates in only one mapreduce job, so it > failed. > But the term "distinct aggregate function" with a form like > aggregate_function(distinct ....), is in connection with the term "all > aggregate function", it essentially is an aggregate function. Only one > result each aggregate function will produce, it's very possible one > mapreduce job could deal with two or more different aggregate expression > simultaneously. -- This message is automatically generated by JIRA. - You can reply to this email to add a comment to the issue online.