[GitHub] [hive] jcamachor commented on a change in pull request #2196: HIVE-24992: Incremental rebuild of MV having aggregate in presence of delete operation

GitBox Tue, 27 Apr 2021 23:10:53 -0700


jcamachor commented on a change in pull request #2196:
URL: https://github.com/apache/hive/pull/2196#discussion_r621840197




##########
File path: 
ql/src/test/queries/clientpositive/materialized_view_create_rewrite_6.q
##########
@@ -0,0 +1,75 @@
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+set hive.materializedview.rewriting.sql=false;
+
+create table t1(a char(15), b int, c int) stored as orc TBLPROPERTIES 
('transactional'='true');
+create table t2(a char(15), b int) stored as orc TBLPROPERTIES 
('transactional'='true');
+
+insert into t1(a, b, c) values
+('update', 1, 1), ('update', 2, 1),
+('null_update', null, 1), ('null_update', null, 2);
+insert into t1(a, b, c) values ('remove', 3, 1), ('null_remove', null, 1);
+insert into t1(a, b, c) values ('sum0', 0, 1), ('sum0', 0, 2);
+
+insert into t2(a, b) values
+('update', 10),
+('null_update', null);
+insert into t2(a, b) values ('remove', 30), ('null_remove', null);
+insert into t2(a, b) values ('sum0', 0);
+
+-- Aggregate with count(*): incremental rebuild should be triggered even if 
there were deletes from source table
+create materialized view mat1 stored as orc TBLPROPERTIES 
('transactional'='true') as
+select t1.a, sum(t1.b), count(*) from t1

Review comment:
       Can we add the corresponding test without `count(*)`, making sure that 
we fall back to a full rebuild?

##########
File path: 
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/views/HiveAggregateInsertDeleteIncrementalRewritingRule.java
##########
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.calcite.rules.views;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.calcite.plan.RelOptRuleCall;
+import org.apache.calcite.rel.RelNode;
+import org.apache.calcite.rel.core.Aggregate;
+import org.apache.calcite.rel.core.AggregateCall;
+import org.apache.calcite.rel.core.Union;
+import org.apache.calcite.rex.RexBuilder;
+import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.sql.SqlAggFunction;
+import org.apache.calcite.sql.SqlKind;
+import org.apache.calcite.sql.fun.SqlStdOperatorTable;
+import org.apache.calcite.tools.RelBuilder;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
+
+import 
org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveHepExtractRelNodeRule;
+
+/**
+ * This rule will perform a rewriting to prepare the plan for incremental
+ * view maintenance in case there exist aggregation operator, so we can
+ * avoid the INSERT OVERWRITE and use a MERGE statement instead.
+ *
+ * In particular, the INSERT OVERWRITE maintenance will look like this
+ * (in SQL):
+ * INSERT OVERWRITE mv
+ * SELECT a, b, SUM(s) as s, SUM(c) AS c
+ * FROM (
+ *   SELECT * from mv --OLD DATA
+ *   UNION ALL
+ *   SELECT a, b, SUM(x) AS s, COUNT(*) AS c --NEW DATA
+ *   FROM TAB_A
+ *   JOIN TAB_B ON (TAB_A.a = TAB_B.z)
+ *   WHERE TAB_A.ROW_ID &gt; 5
+ *   GROUP BY a, b) inner_subq
+ * GROUP BY a, b;
+ *
+ * We need to transform that into:
+ * MERGE INTO mv
+ * USING (
+ *   SELECT a, b, SUM(x) AS s, COUNT(*) AS c --NEW DATA
+ *   FROM TAB_A
+ *   JOIN TAB_B ON (TAB_A.a = TAB_B.z)
+ *   WHERE TAB_A.ROW_ID &gt; 5
+ *   GROUP BY a, b) source
+ * ON (mv.a <=> source.a AND mv.b <=> source.b)
+ * WHEN MATCHED AND mv.c + source.c &lt;&gt; 0
+ *   THEN UPDATE SET mv.s = mv.s + source.s, mv.c = mv.c + source.c
+ * WHEN NOT MATCHED
+ *   THEN INSERT VALUES (source.a, source.b, s, c);
+ * WHEN MATCHED AND countStar = 0 THEN DELETE

Review comment:
       Nit: reorder so that the WHEN NOT MATCHED ... INSERT branch is the last 
branch in the MERGE statement.

##########
File path: 
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/views/HiveAggregateIncrementalRewritingRuleBase.java
##########
@@ -203,17 +147,42 @@ public void onMatch(RelOptRuleCall call) {
               elseReturn));
     }
 
+    int flagIndex = joinLeftInput.getRowType().getFieldCount() - 1;
+    RexNode flagNode = rexBuilder.makeInputRef(
+            join.getRowType().getFieldList().get(flagIndex).getType(), 
flagIndex);
+
     // 6) Build plan
-    // Split this filter condition in 
CalcitePlanner.fixUpASTAggregateIncrementalRebuild:
-    // First disjunct for update branch
-    // Second disjunct for insert branch
-    RexNode filterCond = rexBuilder.makeCall(
-            SqlStdOperatorTable.OR, flagNode, 
rexBuilder.makeCall(SqlStdOperatorTable.IS_NULL, flagNode));
-    RelNode newNode = call.builder()
+    RelNode newNode = relBuilder
         .push(join)
-        .filter(filterCond)
+        .filter(createFilterCondition(joinRightInput, flagNode, projExprs, 
relBuilder))
         .project(projExprs)
         .build();
     call.transformTo(newNode);
   }
+
+  protected abstract T createJoinRightInput(RelOptRuleCall call);
+
+  protected static class RightInput {

Review comment:
       `RightInput` -> `IncrementalComputePlan`? That would imply a bit of 
renaming all over, but I think it's a more expressive name.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [hive] jcamachor commented on a change in pull request #2196: HIVE-24992: Incremental rebuild of MV having aggregate in presence of delete operation

Reply via email to