zabetak commented on code in PR #5524:
URL: https://github.com/apache/hive/pull/5524#discussion_r1822696848


##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdMaxRowCount.java:
##########
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.calcite.stats;
+
+import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider;
+import org.apache.calcite.rel.metadata.RelMdMaxRowCount;
+import org.apache.calcite.rel.metadata.RelMetadataProvider;
+import org.apache.calcite.rel.metadata.RelMetadataQuery;
+import org.apache.calcite.util.BuiltInMethod;
+import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
+import org.apache.hadoop.hive.ql.stats.StatsUtils;
+import org.checkerframework.checker.nullness.qual.Nullable;
+
+/**
+ * Extends {@link RelMdMaxRowCount} to get max row count for {@link 
HiveTableScan}
+ */
+public class HiveRelMdMaxRowCount extends RelMdMaxRowCount {

Review Comment:
   We don't need really need to extend the `RelMdMaxRowCount` handler since 
there are no real inheritance relationships between them; we are adding a 
genuinely new handler for `HiveTableScan` we are not overriding or changing the 
behavior of the existing ones. 
   
   Moreover, the `RelMdMaxRowCount` handler is already registered via the 
`DefaultRelMetadataProvider` so its gonna be used anyways for anything that is 
not a HiveTableScan.



##########
ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java:
##########
@@ -61,6 +61,7 @@
 import org.apache.hadoop.hive.ql.metadata.Partition;
 import org.apache.hadoop.hive.ql.metadata.PartitionIterable;
 import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;

Review Comment:
   To avoid coupling this class utility class with `RelOptHiveTable` maybe we 
could put this method inside `RelOptHiveTable`. 



##########
ql/src/test/queries/clientpositive/prune_empty_table.q:
##########
@@ -0,0 +1,24 @@
+set hive.tez.cartesian-product.enabled=true;
+
+create table c (a1 int) stored as orc;
+create table tmp1 (a int) stored as orc;
+create table tmp2 (a int) stored as orc;
+
+insert into table c values (3);
+insert into table tmp1 values (3);
+
+explain cbo
+with
+first as (
+select a1 from c where a1 = 3
+),

Review Comment:
   CTEs make the query more complex and add another dimension to the problem. 
If possible let's just skip it.
   
   Can't we repro the problem with just two tables that one of them is empty? 
Do we need union all + 3 tables?
   
   Since the query was failing at runtime consider adding also a full execution 
of the query with the results.



##########
ql/src/test/queries/clientpositive/prune_empty_table.q:
##########
@@ -0,0 +1,24 @@
+set hive.tez.cartesian-product.enabled=true;

Review Comment:
   Is the property necessary?



##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdMaxRowCount.java:
##########
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.calcite.stats;
+
+import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider;
+import org.apache.calcite.rel.metadata.RelMdMaxRowCount;
+import org.apache.calcite.rel.metadata.RelMetadataProvider;
+import org.apache.calcite.rel.metadata.RelMetadataQuery;
+import org.apache.calcite.util.BuiltInMethod;
+import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
+import org.apache.hadoop.hive.ql.stats.StatsUtils;
+import org.checkerframework.checker.nullness.qual.Nullable;
+
+/**
+ * Extends {@link RelMdMaxRowCount} to get max row count for {@link 
HiveTableScan}
+ */
+public class HiveRelMdMaxRowCount extends RelMdMaxRowCount {
+
+  public static final RelMetadataProvider SOURCE =
+      ReflectiveRelMetadataProvider.reflectiveSource(
+          BuiltInMethod.MAX_ROW_COUNT.method, new HiveRelMdMaxRowCount()
+      );
+
+  public @Nullable Double getMaxRowCount(HiveTableScan hiveTableScan, 
RelMetadataQuery mq) {
+    RelOptHiveTable table = (RelOptHiveTable) hiveTableScan.getTable();
+    if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(
+        table.getHiveTableMD(), table.getHiveTableMD().getParameters())) {
+      return null;
+    }
+    // if basic stats are up-to-date and the table is not dummy, return 0.0D 
if table is empty
+    // super returns infinity. 
+    return !table.getName().equals(SemanticAnalyzer.DUMMY_DATABASE + "." + 
SemanticAnalyzer.DUMMY_TABLE) &&
+        StatsUtils.isTableEmpty(table) ?

Review Comment:
   If the stats are accurate why do we need this check? Can't we get directly 
the number of rows and return it?



##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdMaxRowCount.java:
##########
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.calcite.stats;
+
+import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider;
+import org.apache.calcite.rel.metadata.RelMdMaxRowCount;
+import org.apache.calcite.rel.metadata.RelMetadataProvider;
+import org.apache.calcite.rel.metadata.RelMetadataQuery;
+import org.apache.calcite.util.BuiltInMethod;
+import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
+import org.apache.hadoop.hive.ql.stats.StatsUtils;
+import org.checkerframework.checker.nullness.qual.Nullable;
+
+/**
+ * Extends {@link RelMdMaxRowCount} to get max row count for {@link 
HiveTableScan}
+ */
+public class HiveRelMdMaxRowCount extends RelMdMaxRowCount {
+
+  public static final RelMetadataProvider SOURCE =
+      ReflectiveRelMetadataProvider.reflectiveSource(
+          BuiltInMethod.MAX_ROW_COUNT.method, new HiveRelMdMaxRowCount()
+      );
+
+  public @Nullable Double getMaxRowCount(HiveTableScan hiveTableScan, 
RelMetadataQuery mq) {
+    RelOptHiveTable table = (RelOptHiveTable) hiveTableScan.getTable();
+    if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(
+        table.getHiveTableMD(), table.getHiveTableMD().getParameters())) {
+      return null;
+    }
+    // if basic stats are up-to-date and the table is not dummy, return 0.0D 
if table is empty
+    // super returns infinity. 
+    return !table.getName().equals(SemanticAnalyzer.DUMMY_DATABASE + "." + 
SemanticAnalyzer.DUMMY_TABLE) &&
+        StatsUtils.isTableEmpty(table) ?
+        Double.valueOf(0.0) :
+        super.getMaxRowCount(hiveTableScan, mq);

Review Comment:
   We could possibly make this logic part of the `RelOptHiveTable`. Doing this 
may allow us to drop completely this MetadataHandler class once we upgrade to a 
more recent Calcite version with 
https://issues.apache.org/jira/browse/CALCITE-4223



##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdMaxRowCount.java:
##########
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.calcite.stats;
+
+import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider;
+import org.apache.calcite.rel.metadata.RelMdMaxRowCount;
+import org.apache.calcite.rel.metadata.RelMetadataProvider;
+import org.apache.calcite.rel.metadata.RelMetadataQuery;
+import org.apache.calcite.util.BuiltInMethod;
+import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
+import org.apache.hadoop.hive.ql.stats.StatsUtils;
+import org.checkerframework.checker.nullness.qual.Nullable;
+
+/**
+ * Extends {@link RelMdMaxRowCount} to get max row count for {@link 
HiveTableScan}
+ */
+public class HiveRelMdMaxRowCount extends RelMdMaxRowCount {
+
+  public static final RelMetadataProvider SOURCE =
+      ReflectiveRelMetadataProvider.reflectiveSource(
+          BuiltInMethod.MAX_ROW_COUNT.method, new HiveRelMdMaxRowCount()
+      );
+
+  public @Nullable Double getMaxRowCount(HiveTableScan hiveTableScan, 
RelMetadataQuery mq) {
+    RelOptHiveTable table = (RelOptHiveTable) hiveTableScan.getTable();
+    if (!StatsUtils.areBasicStatsUptoDateForQueryAnswering(
+        table.getHiveTableMD(), table.getHiveTableMD().getParameters())) {
+      return null;
+    }
+    // if basic stats are up-to-date and the table is not dummy, return 0.0D 
if table is empty
+    // super returns infinity. 
+    return !table.getName().equals(SemanticAnalyzer.DUMMY_DATABASE + "." + 
SemanticAnalyzer.DUMMY_TABLE) &&
+        StatsUtils.isTableEmpty(table) ?
+        Double.valueOf(0.0) :
+        super.getMaxRowCount(hiveTableScan, mq);

Review Comment:
   Probably the call to super can be replaced with a big constant or null.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org
For additional commands, e-mail: gitbox-h...@hive.apache.org

Reply via email to