Hi All,
I have a hive query which does the aggregation of amounts by reading from
hive tables and loads the results to another hive table.
I am trying to fine-tune the attached query. I read online and came up with the
following. Any ideas would be really appreciated. Thank you.
1. Indexing:- We can create an index on the tables. (Some folks say an
index can actually make things worse.)
https://community.cloudera.com/t5/Support-Questions/Creating-Indexes-in-Hive/td-p/149601
2. Execution :- Right now the hive queries run mapreduce engine. We
can set the execution engine to Tez for the improved performance (looks
like Cloudera doesn't support Tez )
https://community.cloudera.com/t5/Support-Questions/Tez-Engine-not-working-over-CDH-5-8-2/td-p/49477
3. Bucketing: improves the join performance if the bucket key and
join keys are common. Bucketing in Hive distributes the data in different
buckets based on the hash results on the bucket key. It also reduces
the I/O scans during the join process if the process is happening on the
same keys (columns).
SET hive.enforce.bucketing=true;
SET hive.optimize.bucketmapjoin=true;
4. Cost-Based Optimization in Hive (CBO)
Before submitting a query for final execution, Hive optimizes the
query's logical and physical execution plan.
CBO, a recent addition to Hive, performs further optimizations based
on query cost. That results in potentially different
decisions: how to order joins, which type of join to
perform, the degree of parallelism, and others.
set hive.cbo.enable=true;
set hive.compute.query.using.stats=true;
set hive.stats.fetch.column.stats=true;
set hive.stats.fetch.partition.stats=true;
5. Vectorization In Hive
To improve the performance of operations we can use vectorized
query execution. It works by processing batches of 1024 rows at
once instead of a single row at a time. It
significantly improves query execution time, and is easily enabled with two
parameter settings:
set hive.vectorized.execution.enabled = true;
set hive.vectorized.execution.reduce.enabled = true;
Bucketing parameters already set to true
Cost-Based Optimization in Hive (CBO) parameters set to true
Vectorization parameters set to true
I am not sure what else I can do to make the query work faster
--
Regards,
Rajbir
-- Recreate the Avro-backed results table from scratch on every run.
-- NOTE: the table is EXTERNAL, so DROP removes only the metadata, not the data files.
DROP TABLE IF EXISTS ${tableName};

-- No column list: the schema is resolved at read time from the Avro
-- schema file referenced by ${schemaLoc}.
CREATE EXTERNAL TABLE ${tableName}
    ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.avro.AvroSerDe'
    STORED AS
        INPUTFORMAT  'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat'
        OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat'
    TBLPROPERTIES ('avro.schema.url' = '${schemaLoc}');
-- Variance report between the source-system table (table1 = ${ssTable}) and the
-- detail temp table (table2 = ${hTable}_DETAIL_TEMP):
--   1) FULL OUTER JOIN on entityId, keeping only rows with a variance
--      (amount mismatch, entity missing on either side, or orgid mismatch);
--   2) UNION ALL the extra (row number > 1) duplicate rows found in table2;
--   3) UNION ALL the extra (row number > 1) duplicate rows found in table1.
INSERT OVERWRITE TABLE ${tableName}
SELECT
    CONCAT(COALESCE(table1.${entityId}, table2.${entityId}), '/',
           '${conceptType}', '/', '${BALDATE}', '/', '${processedDateTime}') AS DetailUid,
    CONCAT(CONCAT(COALESCE(table1.${orgid}, 'NULL'), '/', COALESCE(table2.${orgid}, 'NULL')),
           '/', '${conceptType}', '/', '${BALDATE}', '/', '${processedDateTime}') AS Uid,
    '${conceptType}' AS conceptType,
    COALESCE(table1.${entityId}, table2.${entityId}) AS entityId,
    '${entityName}' AS entityName,
    COALESCE(${TABLE1_FINID}, ${TABLE2_FINID}) AS FINID,
    COALESCE(${TABLE1_acctid}, ${TABLE2_acctid}) AS acctid,
    CAST(CAST(table2.${amount} AS DECIMAL(20,2)) AS STRING) AS amount,
    CAST(CAST(table1.${amount} AS DECIMAL(20,2)) AS STRING) AS ssamount,
    -- Variance = source-system amount minus detail amount; a missing side counts as 0.
    CAST((COALESCE(CAST(table1.${amount} AS DECIMAL(20,2)), 0)
        - COALESCE(CAST(table2.${amount} AS DECIMAL(20,2)), 0)) AS STRING) AS varamount,
    COALESCE(table1.${balDate}, table2.${balDate}) AS BALDATE,
    COALESCE(${TABLE1_borgid}, ${TABLE2_borgid}) AS borgid,
    COALESCE(${TABLE1_borg}, ${TABLE2_borg}) AS borg,
    table2.${facId} AS facId,
    table2.${fac} AS fac,
    table2.${orgid} AS orgid,
    table2.${org} AS org,
    table1.${orgid} AS ssorgid,
    table1.${org} AS ssorg,
    ${TABLE2_encounterId} AS eId,
    ${TABLE2_encounterNumber} AS eNumber,
    ${TABLE2_personId} AS pId,
    table1.sd,
    table2.sv,
    '${processedDateTimeInUTC}' AS processedDateTime,
    -- Every row admitted by the WHERE clause below matches exactly one branch here.
    CASE
        WHEN table1.${entityId} IS NULL OR table2.${entityId} IS NULL
            THEN 'T Missing'
        WHEN (COALESCE(table1.${amount}, 0) - COALESCE(table2.${amount}, 0)) != 0
            THEN 'A Mismatch'
        WHEN table1.${orgid} != table2.${orgid}
            THEN 'SO Mismatch'
        WHEN table1.${orgid} IS NULL OR table2.${orgid} IS NULL
            THEN 'SO Mismatch'
    END AS varianceDescription
FROM ${ssTable} AS table1
FULL OUTER JOIN ${hTable}_DETAIL_TEMP AS table2
    ON table1.${entityId} = table2.${entityId}
-- Keep only rows showing some variance; fully matching rows are dropped.
WHERE (COALESCE(table1.${amount}, 0) - COALESCE(table2.${amount}, 0)) != 0
   OR table1.${entityId} IS NULL
   OR table2.${entityId} IS NULL
   OR table2.${orgid} IS NULL
   OR table1.${orgid} IS NULL
   OR table1.${orgid} != table2.${orgid}

UNION ALL

-- Extra duplicate rows (beyond the first) found in the detail temp table.
SELECT
    table2_duplicates.DetailUid,
    table2_duplicates.Uid,
    table2_duplicates.conceptType,
    table2_duplicates.entityId,
    table2_duplicates.entityName,
    table2_duplicates.FINID,
    table2_duplicates.acctid,
    CAST(table2_duplicates.amount AS STRING) AS amount,
    NULL AS ssamount,
    CAST(table2_duplicates.varamount AS STRING) AS varamount,
    table2_duplicates.BALDATE,
    table2_duplicates.borgid,
    table2_duplicates.borg,
    table2_duplicates.facId,
    table2_duplicates.fac,
    table2_duplicates.orgid,
    table2_duplicates.org,
    NULL AS ssorgid,
    NULL AS ssorg,
    table2_duplicates.eId,
    table2_duplicates.eNumber,
    table2_duplicates.pId,
    table2_duplicates.sd,
    table2_duplicates.sv,
    table2_duplicates.processedDateTime,
    'DTH' AS varianceDescription
FROM (
    SELECT
        CONCAT(table2.${entityId}, '/', '${conceptType}', '/',
               '${BALDATE}', '/', '${processedDateTime}') AS DetailUid,
        -- Fix: was COALESCE(TABLE2.orgid, ...) — raw column name instead of the
        -- ${orgid} placeholder used everywhere else.
        CONCAT(CONCAT('NULL', '/', COALESCE(table2.${orgid}, 'NULL')), '/',
               '${conceptType}', '/', '${BALDATE}', '/', '${processedDateTime}') AS Uid,
        '${conceptType}' AS conceptType,
        table2.${entityId} AS entityId,
        '${entityName}' AS entityName,
        -- Fix: was TABLE2.FINID / TABLE2.acctid — raw column names instead of the
        -- ${TABLE2_FINID} / ${TABLE2_acctid} placeholders used by the first branch.
        ${TABLE2_FINID} AS FINID,
        ${TABLE2_acctid} AS acctid,
        CAST(table2.${amount} AS DECIMAL(20,2)) AS amount,
        NULL AS ssamount,
        -- Duplicate exists only on the detail side, so variance = 0 - amount.
        CAST((table2.${amount} * -1) AS DECIMAL(20,2)) AS varamount,
        -- NOTE(review): first branch uses ${balDate}; confirm both placeholders are defined.
        table2.${balanceDate} AS BALDATE,
        ${TABLE2_borgid} AS borgid,
        ${TABLE2_borg} AS borg,
        table2.${facId} AS facId,
        table2.${fac} AS fac,
        table2.${orgid} AS orgid,
        table2.${org} AS org,
        NULL AS ssorgid,
        NULL AS ssorg,
        -- NOTE(review): first branch uses ${TABLE2_encounterId}/${TABLE2_encounterNumber}/
        -- ${TABLE2_personId}; confirm these shorter placeholders are defined too.
        ${TABLE2_eId} AS eId,
        ${TABLE2_enumber} AS enumber,
        ${TABLE2_pid} AS pId,
        table2.sd AS sd,
        table2.sv AS sv,
        '${processedDateTimeInUTC}' AS processedDateTime,
        -- NOTE(review): no ORDER BY in the window, so WHICH duplicate is treated as
        -- rn = 1 (and kept out of this report) is nondeterministic between runs.
        -- Renamed from "rank": RANK is a reserved word in newer Hive versions.
        ROW_NUMBER() OVER (PARTITION BY table2.${entityId}) AS rn
    -- BUG FIX: the alias here was "HA" while every column reference used TABLE2,
    -- so the subquery could not resolve its columns.
    FROM ${hTable}_DETAIL_TEMP AS table2
    WHERE table2.${entityId} IN (
        SELECT table3.${entityId}
        FROM ${hTable}_DETAIL_TEMP AS table3
        GROUP BY table3.${entityId}
        HAVING COUNT(*) > 1
    )
) table2_duplicates
WHERE table2_duplicates.rn > 1

UNION ALL

-- Extra duplicate rows (beyond the first) found in the source-system table.
SELECT
    table1_duplicates.DetailUid,
    table1_duplicates.Uid,
    table1_duplicates.conceptType,
    table1_duplicates.entityId,
    table1_duplicates.entityName,
    table1_duplicates.FINID,
    -- NOTE(review): the inner query computes acctid but it is discarded here,
    -- unlike the first branch which COALESCEs acctid from both sides — confirm intent.
    NULL AS acctid,
    NULL AS amount,
    CAST(table1_duplicates.ssamount AS STRING) AS ssamount,
    CAST(table1_duplicates.varamount AS STRING) AS varamount,
    table1_duplicates.BALDATE,
    table1_duplicates.borgid,
    table1_duplicates.borg,
    table1_duplicates.facId,
    table1_duplicates.fac,
    NULL AS orgid,
    NULL AS org,
    table1_duplicates.ssorgid,
    table1_duplicates.ssorg,
    table1_duplicates.eid,
    table1_duplicates.enumber,
    table1_duplicates.pid,
    table1_duplicates.sd,
    table1_duplicates.sv,
    table1_duplicates.processedDateTime,
    'DTSS' AS varianceDescription
FROM (
    SELECT
        CONCAT(table1.${entityId}, '/', '${conceptType}', '/',
               '${BALDATE}', '/', '${processedDateTime}') AS DetailUid,
        -- Fix: was COALESCE(table1.orgid, ...) — raw column name instead of ${orgid}.
        CONCAT(CONCAT(COALESCE(table1.${orgid}, 'NULL'), '/', 'NULL'), '/',
               '${conceptType}', '/', '${BALDATE}', '/', '${processedDateTime}') AS Uid,
        '${conceptType}' AS conceptType,
        table1.${entityId} AS entityId,
        '${entityName}' AS entityName,
        ${TABLE1_FINID} AS FINID,
        ${TABLE1_acctid} AS acctid,
        NULL AS amount,
        CAST(table1.${amount} AS DECIMAL(20,2)) AS ssamount,
        -- Duplicate exists only on the source-system side, so variance = amount - 0.
        CAST(table1.${amount} AS DECIMAL(20,2)) AS varamount,
        -- NOTE(review): first branch uses ${balDate}; confirm both placeholders are defined.
        table1.${balanceDate} AS BALDATE,
        ${TABLE1_borgid} AS borgid,
        ${TABLE1_borg} AS borg,
        table1.${facId} AS facId,
        table1.${fac} AS fac,
        NULL AS orgid,
        NULL AS org,
        table1.${orgid} AS ssorgid,
        table1.${org} AS ssorg,
        ${TABLE1_eId} AS eid,
        ${TABLE1_eNumber} AS enumber,
        ${TABLE1_pId} AS pid,
        table1.sd AS sd,
        table1.sv AS sv,
        '${processedDateTimeInUTC}' AS processedDateTime,
        -- NOTE(review): nondeterministic without ORDER BY — see table2_duplicates.
        ROW_NUMBER() OVER (PARTITION BY table1.${entityId}) AS rn
    FROM ${ssTable} AS table1
    WHERE table1.${entityId} IN (
        SELECT table4.${entityId}
        FROM ${ssTable} AS table4
        GROUP BY table4.${entityId}
        HAVING COUNT(*) > 1
    )
) table1_duplicates
WHERE table1_duplicates.rn > 1;