[impala] 02/11: IMPALA-7165: [DOCS] Correct example for dynamic partition pruning

tarmstrong Fri, 15 Feb 2019 17:21:35 -0800

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git


commit 3cda4c2ad0b43b1a784bfc3d39977acd17540e6f
Author: Alex Rodoni <arod...@cloudera.com>
AuthorDate: Tue Jun 12 16:18:20 2018 -0700

    IMPALA-7165: [DOCS] Correct example for dynamic partition pruning
    
    Change-Id: I44d1054f55d3dc7947ccf4c2ef440e506c41f963
    Reviewed-on: http://gerrit.cloudera.org:8080/10703
    Reviewed-by: Vuk Ercegovac <vercego...@cloudera.com>
    Tested-by: Impala Public Jenkins <impala-public-jenk...@cloudera.com>
---
 docs/shared/impala_common.xml            | 72 +++++++++++++++---------------
 docs/topics/impala_partitioning.xml      | 75 +++++++++++++++++++++++---------
 docs/topics/impala_runtime_filtering.xml | 17 +++++---
 3 files changed, 101 insertions(+), 63 deletions(-)

diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml
index 7bb187a..38a7b96 100644
--- a/docs/shared/impala_common.xml
+++ b/docs/shared/impala_common.xml
@@ -1798,42 +1798,44 @@ from length_demo;
       </p>
 
 <codeblock id="simple_dpp_example">
-create table yy (s string) partitioned by (year int) stored as parquet;
-insert into yy partition (year) values ('1999', 1999), ('2000', 2000),
-  ('2001', 2001), ('2010',2010);
-compute stats yy;
-
-create table yy2 (s string) partitioned by (year int) stored as parquet;
-insert into yy2 partition (year) values ('1999', 1999), ('2000', 2000),
-  ('2001', 2001);
-compute stats yy2;
-
--- The query reads an unknown number of partitions, whose key values are only
--- known at run time. The 'runtime filters' lines show how the information 
about
--- the partitions is calculated in query fragment 02, and then used in query
--- fragment 00 to decide which partitions to skip.
-explain select s from yy2 where year in (select year from yy where year 
between 2000 and 2005);
-+----------------------------------------------------------+
-| Explain String                                           |
-+----------------------------------------------------------+
-| Estimated Per-Host Requirements: Memory=16.00MB VCores=2 |
-|                                                          |
-| 04:EXCHANGE [UNPARTITIONED]                              |
-| |                                                        |
-| 02:HASH JOIN [LEFT SEMI JOIN, BROADCAST]                 |
-| |  hash predicates: year = year                          |
-| |  <b>runtime filters: RF000 &lt;- year</b>                        |
-| |                                                        |
-| |--03:EXCHANGE [BROADCAST]                               |
-| |  |                                                     |
-| |  01:SCAN HDFS [dpp.yy]                                 |
-| |     partitions=2/4 files=2 size=468B                   |
-| |                                                        |
-| 00:SCAN HDFS [dpp.yy2]                                   |
-|    partitions=2/3 files=2 size=468B                      |
-|    <b>runtime filters: RF000 -> year</b>                        |
-+----------------------------------------------------------+
+CREATE TABLE yy (s STRING) PARTITIONED BY (year INT);
+INSERT INTO yy PARTITION (year) VALUES ('1999', 1999), ('2000', 2000),
+  ('2001', 2001), ('2010', 2010), ('2018', 2018);
+COMPUTE STATS yy;
+
+CREATE TABLE yy2 (s STRING, year INT);
+INSERT INTO yy2 VALUES ('1999', 1999), ('2000', 2000), ('2001', 2001);
+COMPUTE STATS yy2;
+
+-- The following query reads an unknown number of partitions, whose key values
+-- are only known at run time. The 'runtime filters' lines show how the
+-- information about the partitions is calculated in query fragment 02, and 
then
+-- used in query fragment 00 to decide which partitions to skip.
+
+EXPLAIN SELECT s FROM yy WHERE year IN (SELECT year FROM yy2);
++--------------------------------------------------------------------------+
+| PLAN-ROOT SINK                                                           |
+| |                                                                        |
+| 04:EXCHANGE [UNPARTITIONED]                                              |
+| |                                                                        |
+| 02:HASH JOIN [LEFT SEMI JOIN, BROADCAST]                                 |
+| |  hash predicates: year = year                                          |
+| |  <b>runtime filters: RF000 &lt;- year</b>                              |
+| |                                                                        |
+| |--03:EXCHANGE [BROADCAST]                                               |
+| |  |                                                                     |
+| |  01:SCAN HDFS [default.yy2]                                            |
+| |     partitions=1/1 files=1 size=620B                                   |
+| |                                                                        |
+| 00:SCAN HDFS [default.yy]                                                |
+|    <b>partitions=5/5</b> files=5 size=1.71KB                             |
+|    runtime filters: RF000 -> year                                        |
++--------------------------------------------------------------------------+
+
+SELECT s FROM yy WHERE year IN (SELECT year FROM yy2); -- Returns 3 rows from 
yy
+PROFILE;
 </codeblock>
+
       <p id="order_by_scratch_dir">
         By default, intermediate files used during large sort, join, 
aggregation, or analytic function operations
         are stored in the directory <filepath>/tmp/impala-scratch</filepath> . 
These files are removed when the
diff --git a/docs/topics/impala_partitioning.xml 
b/docs/topics/impala_partitioning.xml
index c2e36ed..011ff88 100644
--- a/docs/topics/impala_partitioning.xml
+++ b/docs/topics/impala_partitioning.xml
@@ -143,19 +143,27 @@ under the License.
         </li>
 
         <li>
-          <codeph><xref href="impala_alter_table.xml#alter_table">ALTER 
TABLE</xref></codeph>: you can add or drop partitions, to work with
-          different portions of a huge data set. You can designate the HDFS 
directory that holds the data files for a specific partition.
-          With data partitioned by date values, you might <q>age out</q> data 
that is no longer relevant.
-          <note 
conref="../shared/impala_common.xml#common/add_partition_set_location"/>
+          <codeph><xref href="impala_alter_table.xml#alter_table">ALTER
+              TABLE</xref></codeph>: you can add or drop partitions, to work
+          with different portions of a huge data set. You can designate the 
HDFS
+          directory that holds the data files for a specific partition. With
+          data partitioned by date values, you might <q>age out</q> data that 
is
+          no longer relevant. <note
+            
conref="../shared/impala_common.xml#common/add_partition_set_location"
+          />
         </li>
 
         <li>
-          <codeph><xref 
href="impala_insert.xml#insert">INSERT</xref></codeph>: When you insert data 
into a partitioned table, you identify
-          the partitioning columns. One or more values from each inserted row 
are not stored in data files, but instead determine the
-          directory where that row value is stored. You can also specify which 
partition to load a set of data into, with <codeph>INSERT
-          OVERWRITE</codeph> statements; you can replace the contents of a 
specific partition but you cannot append data to a specific
-          partition.
-          <p rev="1.3.1" 
conref="../shared/impala_common.xml#common/insert_inherit_permissions"/>
+          <codeph><xref href="impala_insert.xml#insert">INSERT</xref></codeph>:
+          When you insert data into a partitioned table, you identify the
+          partitioning columns. One or more values from each inserted row are
+          not stored in data files, but instead determine the directory where
+          that row value is stored. You can also specify which partition to 
load
+          a set of data into, with <codeph>INSERT OVERWRITE</codeph> 
statements;
+          you can replace the contents of a specific partition but you cannot
+          append data to a specific partition. <p rev="1.3.1"
+            
conref="../shared/impala_common.xml#common/insert_inherit_permissions"
+          />
         </li>
 
         <li>
@@ -242,7 +250,8 @@ insert into weather <b>partition (year=2014, month=04, 
day)</b> select 'sunny',2
 
     <conbody>
 
-      <p rev="1.3.1" 
conref="../shared/impala_common.xml#common/insert_inherit_permissions"/>
+      <p rev="1.3.1"
+        
conref="../shared/impala_common.xml#common/insert_inherit_permissions"/>
 
     </conbody>
 
@@ -377,7 +386,8 @@ insert into weather <b>partition (year=2014, month=04, 
day)</b> select 'sunny',2
 
         <p conref="../shared/impala_common.xml#common/partitions_and_views"/>
 
-        <p 
conref="../shared/impala_common.xml#common/analytic_partition_pruning_caveat"/>
+        <p
+          
conref="../shared/impala_common.xml#common/analytic_partition_pruning_caveat"/>
 
       </conbody>
 
@@ -408,19 +418,38 @@ SELECT COUNT(*) FROM sales_table WHERE year IN (2005, 
2010, 2015);
 </codeblock>
 
         <p>
-          Dynamic partition pruning involves using information only available 
at run time, such as the result of a subquery:
+          Dynamic partition pruning involves using information only available
+          at run time, such as the result of a subquery. The following example
+          shows a simple dynamic partition pruning.
         </p>
 
 <codeblock conref="../shared/impala_common.xml#common/simple_dpp_example"/>
 
-<!-- Former example. Not sure it really would trigger DPP. SELECT COUNT(*) 
FROM sales_table WHERE year = (SELECT MAX(year) FROM some_other_table); -->
+        <p>
+          In the above example, Impala evaluates the subquery, sends the
+          subquery results to all Impala nodes participating in the query, and
+          then each <cmdname>impalad</cmdname> daemon uses the dynamic 
partition
+          pruning optimization to read only the partitions with the relevant 
key
+          values.
+        </p>
 
         <p>
-          In this case, Impala evaluates the subquery, sends the subquery 
results to all Impala nodes participating in the query, and then
-          each <cmdname>impalad</cmdname> daemon uses the dynamic partition 
pruning optimization to read only the partitions with the
-          relevant key values.
+          The output query plan from the <codeph>EXPLAIN</codeph> statement
+          shows that runtime filters are enabled. The plan also shows that it
+          expects to read all 5 partitions of the <codeph>yy</codeph> table,
+          indicating that static partition pruning will not happen.
         </p>
 
+        <p>The Filter summary in the <codeph>PROFILE</codeph> output shows that
+          the scan node filtered out based on a runtime filter of dynamic
+          partition pruning. </p>
+
+<codeblock>Filter 0 (1.00 MB):
+ - Files processed: 3
+ - <b>Files rejected: 1 (1)</b>
+ - Files total: 3 (3)
+</codeblock>
+
         <p>
           Dynamic partition pruning is especially effective for queries 
involving joins of several large partitioned tables. Evaluating the
           <codeph>ON</codeph> clauses of the join predicates might normally 
require reading data from all partitions of certain tables. If
@@ -429,7 +458,8 @@ SELECT COUNT(*) FROM sales_table WHERE year IN (2005, 2010, 
2015);
           and the amount of intermediate data stored and transmitted across 
the network during the query.
         </p>
 
-        <p 
conref="../shared/impala_common.xml#common/spill_to_disk_vs_dynamic_partition_pruning"/>
+        <p
+          
conref="../shared/impala_common.xml#common/spill_to_disk_vs_dynamic_partition_pruning"/>
 
         <p>
           Dynamic partition pruning is part of the runtime filtering feature, 
which applies to other kinds of queries in addition to queries
@@ -479,7 +509,8 @@ SELECT COUNT(*) FROM sales_table WHERE year IN (2005, 2010, 
2015);
         </li>
 
         <li>
-          <p 
conref="../shared/impala_common.xml#common/complex_types_partitioning"/>
+          <p
+            
conref="../shared/impala_common.xml#common/complex_types_partitioning"/>
         </li>
 
         <li>
@@ -559,7 +590,8 @@ SELECT COUNT(*) FROM sales_table WHERE year IN (2005, 2010, 
2015);
         formats.
       </p>
 
-      <note 
conref="../shared/impala_common.xml#common/add_partition_set_location"/>
+      <note
+        
conref="../shared/impala_common.xml#common/add_partition_set_location"/>
 
       <p>
         What happens to the data files when a partition is dropped depends on 
whether the partitioned table is designated as internal or
@@ -615,7 +647,8 @@ SELECT COUNT(*) FROM sales_table WHERE year IN (2005, 2010, 
2015);
 
       <note type="important">
         <p conref="../shared/impala_common.xml#common/cs_or_cis"/>
-        <p 
conref="../shared/impala_common.xml#common/incremental_stats_after_full"/>
+        <p
+          
conref="../shared/impala_common.xml#common/incremental_stats_after_full"/>
         <p 
conref="../shared/impala_common.xml#common/incremental_stats_caveats"/>
       </note>
 
diff --git a/docs/topics/impala_runtime_filtering.xml 
b/docs/topics/impala_runtime_filtering.xml
index 05c3743..cce9155 100644
--- a/docs/topics/impala_runtime_filtering.xml
+++ b/docs/topics/impala_runtime_filtering.xml
@@ -344,18 +344,21 @@ under the License.
       </p>
 
       <p>
-        The following example shows a query that uses a single runtime filter 
(labelled <codeph>RF00</codeph>)
-        to prune the partitions that are scanned in one stage of the query, 
based on evaluating the
-        result set of a subquery:
+        The following example shows a query that uses a single runtime filter,
+        labeled <codeph>RF000</codeph>, to prune the partitions that are 
scanned
+        in one stage of the query, based on evaluating the result set of a
+        subquery:
       </p>
 
 <codeblock conref="../shared/impala_common.xml#common/simple_dpp_example"/>
 
       <p>
-        The query profile (displayed by the <codeph>PROFILE</codeph> command 
in <cmdname>impala-shell</cmdname>)
-        contains both the <codeph>EXPLAIN</codeph> plan and more detailed 
information about the internal
-        workings of the query. The profile output includes a section labelled 
the <q>filter routing table</q>,
-        with information about each filter based on its ID.
+        The query profile (displayed by the <codeph>PROFILE</codeph> command
+        in <cmdname>impala-shell</cmdname>) contains both the
+          <codeph>EXPLAIN</codeph> plan and more detailed information about the
+        internal workings of the query. The profile output includes the
+          <codeph>Filter routing table</codeph> section with information about
+        each filter based on its ID.
       </p>
     </conbody>
   </concept>

[impala] 02/11: IMPALA-7165: [DOCS] Correct example for dynamic partition pruning

Reply via email to