This is an automated email from the ASF dual-hosted git repository. bridgetb pushed a commit to branch gh-pages in repository https://gitbox.apache.org/repos/asf/drill.git
commit fd42f80df2fab5a1a23a88ca96e3d59026091584 Author: Volodymyr Vysotskyi <vvo...@gmail.com> AuthorDate: Mon Jan 13 16:43:10 2020 +0200 Add docs for Drill Metastore --- _data/docs.json | 156 +++++++++++++++----- _docs/getting-started/010-drill-introduction.md | 4 +- _docs/rn/005-1.16.0-rn.md | 4 +- .../007-analyze-table-refresh-metadata.md | 158 +++++++++++++++++++++ ....md => 009-analyze-table-compute-statistics.md} | 81 +++++------ 5 files changed, 315 insertions(+), 88 deletions(-) diff --git a/_data/docs.json b/_data/docs.json index b8f927f..f74ef35 100644 --- a/_data/docs.json +++ b/_data/docs.json @@ -21,7 +21,7 @@ "title": "ALTER SYSTEM", "url": "/docs/alter-system/" }, - "ANALYZE TABLE": { + "ANALYZE TABLE COMPUTE STATISTICS": { "breadcrumbs": [ { "title": "SQL Commands", @@ -36,11 +36,32 @@ "next_title": "SET", "next_url": "/docs/set/", "parent": "SQL Commands", + "previous_title": "ANALYZE TABLE REFRESH METADATA", + "previous_url": "/docs/analyze-table-refresh-metadata/", + "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table-compute-statistics.md", + "title": "ANALYZE TABLE COMPUTE STATISTICS", + "url": "/docs/analyze-table-compute-statistics/" + }, + "ANALYZE TABLE REFRESH METADATA": { + "breadcrumbs": [ + { + "title": "SQL Commands", + "url": "/docs/sql-commands/" + }, + { + "title": "SQL Reference", + "url": "/docs/sql-reference/" + } + ], + "children": [], + "next_title": "ANALYZE TABLE COMPUTE STATISTICS", + "next_url": "/docs/analyze-table-compute-statistics/", + "parent": "SQL Commands", "previous_title": "Supported SQL Commands", "previous_url": "/docs/supported-sql-commands/", - "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table.md", - "title": "ANALYZE TABLE", - "url": "/docs/analyze-table/" + "relative_path": "_docs/sql-reference/sql-commands/007-analyze-table-refresh-metadata.md", + "title": "ANALYZE TABLE REFRESH METADATA", + "url": "/docs/analyze-table-refresh-metadata/" }, "About SQL Function Examples": 
{ "breadcrumbs": [ @@ -11315,8 +11336,8 @@ "next_title": "REFRESH TABLE METADATA", "next_url": "/docs/refresh-table-metadata/", "parent": "SQL Commands", - "previous_title": "ANALYZE TABLE", - "previous_url": "/docs/analyze-table/", + "previous_title": "ANALYZE TABLE COMPUTE STATISTICS", + "previous_url": "/docs/analyze-table-compute-statistics/", "relative_path": "_docs/sql-reference/sql-commands/010-set.md", "title": "SET", "url": "/docs/set/" @@ -11404,8 +11425,8 @@ } ], "children": [], - "next_title": "ANALYZE TABLE", - "next_url": "/docs/analyze-table/", + "next_title": "ANALYZE TABLE REFRESH METADATA", + "next_url": "/docs/analyze-table-refresh-metadata/", "parent": "SQL Commands", "previous_title": "SQL Commands", "previous_url": "/docs/sql-commands/", @@ -11425,14 +11446,35 @@ } ], "children": [], - "next_title": "SET", - "next_url": "/docs/set/", + "next_title": "ANALYZE TABLE COMPUTE STATISTICS", + "next_url": "/docs/analyze-table-compute-statistics/", "parent": "SQL Commands", "previous_title": "Supported SQL Commands", "previous_url": "/docs/supported-sql-commands/", - "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table.md", - "title": "ANALYZE TABLE", - "url": "/docs/analyze-table/" + "relative_path": "_docs/sql-reference/sql-commands/007-analyze-table-refresh-metadata.md", + "title": "ANALYZE TABLE REFRESH METADATA", + "url": "/docs/analyze-table-refresh-metadata/" + }, + { + "breadcrumbs": [ + { + "title": "SQL Commands", + "url": "/docs/sql-commands/" + }, + { + "title": "SQL Reference", + "url": "/docs/sql-reference/" + } + ], + "children": [], + "next_title": "SET", + "next_url": "/docs/set/", + "parent": "SQL Commands", + "previous_title": "ANALYZE TABLE REFRESH METADATA", + "previous_url": "/docs/analyze-table-refresh-metadata/", + "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table-compute-statistics.md", + "title": "ANALYZE TABLE COMPUTE STATISTICS", + "url": "/docs/analyze-table-compute-statistics/" }, { 
"breadcrumbs": [ @@ -11449,8 +11491,8 @@ "next_title": "REFRESH TABLE METADATA", "next_url": "/docs/refresh-table-metadata/", "parent": "SQL Commands", - "previous_title": "ANALYZE TABLE", - "previous_url": "/docs/analyze-table/", + "previous_title": "ANALYZE TABLE COMPUTE STATISTICS", + "previous_url": "/docs/analyze-table-compute-statistics/", "relative_path": "_docs/sql-reference/sql-commands/010-set.md", "title": "SET", "url": "/docs/set/" @@ -13067,8 +13109,8 @@ } ], "children": [], - "next_title": "ANALYZE TABLE", - "next_url": "/docs/analyze-table/", + "next_title": "ANALYZE TABLE REFRESH METADATA", + "next_url": "/docs/analyze-table-refresh-metadata/", "parent": "SQL Commands", "previous_title": "SQL Commands", "previous_url": "/docs/sql-commands/", @@ -13088,14 +13130,35 @@ } ], "children": [], - "next_title": "SET", - "next_url": "/docs/set/", + "next_title": "ANALYZE TABLE COMPUTE STATISTICS", + "next_url": "/docs/analyze-table-compute-statistics/", "parent": "SQL Commands", "previous_title": "Supported SQL Commands", "previous_url": "/docs/supported-sql-commands/", - "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table.md", - "title": "ANALYZE TABLE", - "url": "/docs/analyze-table/" + "relative_path": "_docs/sql-reference/sql-commands/007-analyze-table-refresh-metadata.md", + "title": "ANALYZE TABLE REFRESH METADATA", + "url": "/docs/analyze-table-refresh-metadata/" + }, + { + "breadcrumbs": [ + { + "title": "SQL Commands", + "url": "/docs/sql-commands/" + }, + { + "title": "SQL Reference", + "url": "/docs/sql-reference/" + } + ], + "children": [], + "next_title": "SET", + "next_url": "/docs/set/", + "parent": "SQL Commands", + "previous_title": "ANALYZE TABLE REFRESH METADATA", + "previous_url": "/docs/analyze-table-refresh-metadata/", + "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table-compute-statistics.md", + "title": "ANALYZE TABLE COMPUTE STATISTICS", + "url": "/docs/analyze-table-compute-statistics/" }, { 
"breadcrumbs": [ @@ -13112,8 +13175,8 @@ "next_title": "REFRESH TABLE METADATA", "next_url": "/docs/refresh-table-metadata/", "parent": "SQL Commands", - "previous_title": "ANALYZE TABLE", - "previous_url": "/docs/analyze-table/", + "previous_title": "ANALYZE TABLE COMPUTE STATISTICS", + "previous_url": "/docs/analyze-table-compute-statistics/", "relative_path": "_docs/sql-reference/sql-commands/010-set.md", "title": "SET", "url": "/docs/set/" @@ -14900,8 +14963,8 @@ } ], "children": [], - "next_title": "ANALYZE TABLE", - "next_url": "/docs/analyze-table/", + "next_title": "ANALYZE TABLE REFRESH METADATA", + "next_url": "/docs/analyze-table-refresh-metadata/", "parent": "SQL Commands", "previous_title": "SQL Commands", "previous_url": "/docs/sql-commands/", @@ -20703,8 +20766,8 @@ } ], "children": [], - "next_title": "ANALYZE TABLE", - "next_url": "/docs/analyze-table/", + "next_title": "ANALYZE TABLE REFRESH METADATA", + "next_url": "/docs/analyze-table-refresh-metadata/", "parent": "SQL Commands", "previous_title": "SQL Commands", "previous_url": "/docs/sql-commands/", @@ -20724,14 +20787,35 @@ } ], "children": [], - "next_title": "SET", - "next_url": "/docs/set/", + "next_title": "ANALYZE TABLE COMPUTE STATISTICS", + "next_url": "/docs/analyze-table-compute-statistics/", "parent": "SQL Commands", "previous_title": "Supported SQL Commands", "previous_url": "/docs/supported-sql-commands/", - "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table.md", - "title": "ANALYZE TABLE", - "url": "/docs/analyze-table/" + "relative_path": "_docs/sql-reference/sql-commands/007-analyze-table-refresh-metadata.md", + "title": "ANALYZE TABLE REFRESH METADATA", + "url": "/docs/analyze-table-refresh-metadata/" + }, + { + "breadcrumbs": [ + { + "title": "SQL Commands", + "url": "/docs/sql-commands/" + }, + { + "title": "SQL Reference", + "url": "/docs/sql-reference/" + } + ], + "children": [], + "next_title": "SET", + "next_url": "/docs/set/", + "parent": "SQL 
Commands", + "previous_title": "ANALYZE TABLE REFRESH METADATA", + "previous_url": "/docs/analyze-table-refresh-metadata/", + "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table-compute-statistics.md", + "title": "ANALYZE TABLE COMPUTE STATISTICS", + "url": "/docs/analyze-table-compute-statistics/" }, { "breadcrumbs": [ @@ -20748,8 +20832,8 @@ "next_title": "REFRESH TABLE METADATA", "next_url": "/docs/refresh-table-metadata/", "parent": "SQL Commands", - "previous_title": "ANALYZE TABLE", - "previous_url": "/docs/analyze-table/", + "previous_title": "ANALYZE TABLE COMPUTE STATISTICS", + "previous_url": "/docs/analyze-table-compute-statistics/", "relative_path": "_docs/sql-reference/sql-commands/010-set.md", "title": "SET", "url": "/docs/set/" @@ -22720,4 +22804,4 @@ "url": "/docs/ecosystem/" } ] -} \ No newline at end of file +} diff --git a/_docs/getting-started/010-drill-introduction.md b/_docs/getting-started/010-drill-introduction.md index 7035a98..5ab61aa 100644 --- a/_docs/getting-started/010-drill-introduction.md +++ b/_docs/getting-started/010-drill-introduction.md @@ -1,6 +1,6 @@ --- title: "Drill Introduction" -date: 2019-12-26 +date: 2020-01-13 parent: "Getting Started" --- Drill is an Apache open-source SQL query engine for Big Data exploration. 
@@ -46,7 +46,7 @@ Metastore support: ## What's New in Apache Drill 1.16 -- [ANALYZE TABLE statement]({{site.baseurl}}/docs/analyze-table/) to computes statistics on Parquet data ([DRILL-1328](https://issues.apache.org/jira/browse/DRILL-1328)) +- [ANALYZE TABLE statement]({{site.baseurl}}/docs/analyze-table-compute-statistics/) to computes statistics on Parquet data ([DRILL-1328](https://issues.apache.org/jira/browse/DRILL-1328)) - [CREATE OR REPLACE SCHEMA command]({{site.baseurl}}/docs/create-or-replace-schema/) to define a schema for text files ([DRILL-6964](https://issues.apache.org/jira/browse/DRILL-6964)) - [REFRESH TABLE METADATA command]({{site.baseurl}}/docs/refresh-table-metadata/) can generate metadata cache files for specific columns ([DRILL-7058](https://issues.apache.org/jira/browse/DRILL-7058)) - [SYSLOG (RFC-5424) Format Plugin]({{site.baseurl}}/docs/syslog-format-plugin/) ([DRILL-6582](https://issues.apache.org/jira/browse/DRILL-6582)) diff --git a/_docs/rn/005-1.16.0-rn.md b/_docs/rn/005-1.16.0-rn.md index ac07ab7..88fabd7 100644 --- a/_docs/rn/005-1.16.0-rn.md +++ b/_docs/rn/005-1.16.0-rn.md @@ -12,7 +12,7 @@ Today, we're happy to announce the availability of Drill 1.16.0. 
You can downloa This release of Drill provides the following new features and improvements: -- [ANALYZE TABLE statement]({{site.baseurl}}/docs/analyze-table/) to computes statistics on Parquet data ([DRILL-1328](https://issues.apache.org/jira/browse/DRILL-1328)) +- [ANALYZE TABLE statement]({{site.baseurl}}/docs/analyze-table-compute-statistics/) to computes statistics on Parquet data ([DRILL-1328](https://issues.apache.org/jira/browse/DRILL-1328)) - [CREATE OR REPLACE SCHEMA command]({{site.baseurl}}/docs/create-or-replace-schema/) to define a schema for text files ([DRILL-6964](https://issues.apache.org/jira/browse/DRILL-6964)) - [REFRESH TABLE METADATA command]({{site.baseurl}}/docs/refresh-table-metadata/) can generate metadata cache files for specific columns ([DRILL-7058](https://issues.apache.org/jira/browse/DRILL-7058)) - [SYSLOG (RFC-5424) Format Plugin]({{site.baseurl}}/docs/syslog-format-plugin/) ([DRILL-6582](https://issues.apache.org/jira/browse/DRILL-6582)) @@ -498,4 +498,4 @@ The following sections provide a complete list of all the fixes and improvements </li> </ul> - \ No newline at end of file + diff --git a/_docs/sql-reference/sql-commands/007-analyze-table-refresh-metadata.md b/_docs/sql-reference/sql-commands/007-analyze-table-refresh-metadata.md new file mode 100644 index 0000000..21fa711 --- /dev/null +++ b/_docs/sql-reference/sql-commands/007-analyze-table-refresh-metadata.md @@ -0,0 +1,158 @@ +--- +title: "ANALYZE TABLE REFRESH METADATA" +parent: "SQL Commands" +date: 2020-01-13 +--- + +Starting from Drill 1.17, you can store table metadata (including schema and computed statistics) into Drill Metastore. +This metadata will be used when querying a table for more optimal plan creation. 
+ +{% include startnote.html %}In Drill 1.17, this feature is supported for Parquet tables only and is disabled by default.{% include endnote.html %} + +To enable Drill Metastore usage, the following option `metastore.enabled` should be set to `true`, as shown: + + SET `metastore.enabled` = true; + +Alternatively, you can enable the option in the Drill Web UI at `http://<drill-hostname-or-ip-address>:8047/options`. + +## Syntax + +The ANALYZE TABLE REFRESH METADATA statement supports the following syntax: + + ANALYZE TABLE [table_name] [COLUMNS {(col1, col2, ...) | NONE}] + REFRESH METADATA ['level' LEVEL] + [{COMPUTE | ESTIMATE} | STATISTICS [(column1, column2, ...)] + [ SAMPLE number PERCENT ]] + +## Parameters + +*table_name* +The name of the table or directory for which Drill will collect table metadata. If the table does not exist, or the table + is temporary, the command fails and metadata is not collected and stored. + +*COLUMNS (col1, col2, ...)* +Optional names of the column(s) for which Drill will generate and store metadata. The stored schema will include all table columns. + +*COLUMNS NONE* +Specifies to ignore collecting and storing metadata for all table columns. + +*level* +Optional varchar literal which specifies maximum level depth for collecting metadata. +Possible values: `TABLE`, `SEGMENT`, `PARTITION`, `FILE`, `ROW_GROUP`, `ALL`. Default is `ALL`. + +*COMPUTE* +Generates statistics for the table to be stored into the Metastore. +If statistics usage is disabled (`planner.enable_statistics` is set to `false`), an error will be thrown when this clause is specified. + +*ESTIMATE* +Generates estimated statistics for the table to be stored into the Metastore. Currently it is not supported. + +*(column1, column2, ...)* +The name of the column(s) for which Drill will generate statistics. + +*SAMPLE* +Optional. Indicates that compute statistics should run on a subset of the data.
+ +*number PERCENT* +An integer that specifies the percentage of data on which to compute statistics. For example, if a table has 100 rows, `SAMPLE 50 PERCENT` indicates that statistics should be computed on 50 rows. The optimizer selects the rows at random. + +## Related Options + +- **metastore.enabled** +Enables Drill Metastore usage to be able to store table metadata during ANALYZE TABLE commands execution and to be able + to read table metadata during regular queries execution or when querying some INFORMATION_SCHEMA tables. Default is `false`. +- **metastore.metadata.store.depth_level** +Specifies maximum level depth for collecting metadata. Default is `'ALL'`. +- **metastore.retrieval.retry_attempts** +Specifies the number of attempts for retrying query planning after detecting that query metadata is changed. +If the number of retries was exceeded, query will be planned without metadata information from the Metastore. Default is 5. +- **metastore.metadata.fallback_to_file_metadata** +Allows using file metadata cache for the case when required metadata is absent in the Metastore. Default is true. +- **metastore.metadata.use_schema** +Enables schema usage, stored to the Metastore. Default is `true`. +- **metastore.metadata.use_statistics** +Enables statistics usage, stored in the Metastore, at the planning stage. Default is `true`. +- **metastore.metadata.ctas.auto-collect** +Specifies whether schema and / or column statistics will be automatically collected for every table after CTAS and CTTAS. +This option is not active for now. Default is `'NONE'`. +- **drill.exec.storage.implicit.last_modified_time.column.label** +Sets the implicit column name for the last modified time (`lmt`) column. For internal usage when producing Metastore analyze. +- **drill.exec.storage.implicit.row_group_index.column.label** +Sets the implicit column name for the row group index (`rgi`) column. For internal usage when producing Metastore analyze. 
+- **drill.exec.storage.implicit.row_group_length.column.label** +Sets the implicit column name for the row group length (`rgl`) column. For internal usage when producing Metastore analyze. +- **drill.exec.storage.implicit.row_group_start.column.label** +Sets the implicit column name for the row group start (`rgs`) column. For internal usage when producing Metastore analyze. + +## Related Commands + +To drop table metadata from the Metastore, the following command may be used: + + ANALYZE TABLE [table_name] DROP [METADATA|STATISTICS] [IF EXISTS] + +It will not throw an exception for absent table metadata if `IF EXISTS` clause was specified. + + ANALYZE TABLE [workspace.]table_name COMPUTE STATISTICS [(column1, column2,...)] [SAMPLE number PERCENT] + +See [ANALYZE TABLE COMPUTE STATISTICS]({{site.baseurl}}/docs/analyze-table-compute-statistics). + +## Usage Notes + +### General Information + +- Currently `ANALYZE TABLE REFRESH METADATA` statement can compute and store metadata only for Parquet tables within `dfs` storage plugins. +- For the case when `ANALYZE TABLE REFRESH METADATA` command is executed for the first time, whole table metadata will be collected and stored into Metastore. +If analyze was already executed for the table, and table data wasn't changed, all further analyze commands wouldn't trigger table analyzing and message that table metadata is up to date will be returned. + +### Incremental analyze + +For the case when some table data was updated, Drill will try to execute incremental analyze - calculate metadata only for updated data and reuse required metadata from the Metastore. + +Incremental analyze wouldn't be produced for the following cases: +- list of interesting columns specified in analyze is not a subset of interesting columns from the previous analyze; +- specified metadata level differs from the metadata level in previous analyze. 
+ +### Metadata usage + +Drill provides the ability to use metadata obtained from the Metastore at the planning stage to prune segments, files + and row groups. + +Tables metadata from the Metastore is exposed to `INFORMATION_SCHEMA` tables (if Metastore usage is enabled). + +The following tables are populated with table metadata from the Metastore: + +`TABLES` table has the following additional columns populated from the Metastore: + +- `TABLE_SOURCE` - table data type: `PARQUET`, `CSV`, `JSON` +- `LOCATION` - table location: `/tmp/nation` +- `NUM_ROWS` - number of rows in a table if known, `null` if not known +- `LAST_MODIFIED_TIME` - table's last modification time + +`COLUMNS` table has the following additional columns populated from the Metastore: + +- `COLUMN_DEFAULT` - column default value +- `COLUMN_FORMAT` - usually applicable for date time columns: `yyyy-MM-dd` +- `NUM_NULLS` - number of nulls in column values +- `MIN_VAL` - column min value in String representation: `aaa` +- `MAX_VAL` - column max value in String representation: `zzz` +- `NDV` - number of distinct values in column, expressed in Double +- `EST_NUM_NON_NULLS` - estimated number of non null values, expressed in Double +- `IS_NESTED` - if column is nested. Nested columns are extracted from columns with struct type. 
+ +`PARTITIONS` table has the following additional columns populated from the Metastore: + +- `TABLE_CATALOG` - table catalog (currently we have only one catalog): `DRILL` +- `TABLE_SCHEMA` - table schema: `dfs.tmp` +- `TABLE_NAME` - table name: `nation` +- `METADATA_KEY` - top level segment key, the same for all nested segments and partitions: `part_int=3` +- `METADATA_TYPE` - `SEGMENT` or `PARTITION` +- `METADATA_IDENTIFIER` - current metadata identifier: `part_int=3/part_varchar=g` +- `PARTITION_COLUMN` - partition column name: `part_varchar` +- `PARTITION_VALUE` - partition column value: `g` +- `LOCATION` - segment location, `null` for partitions: `/tmp/nation/part_int=3` +- `LAST_MODIFIED_TIME` - last modification time + +## Limitations + +This feature is currently in the alpha phase (preview, experimental) for Drill 1.17 and only applies to Parquet + tables in this release. You must enable this feature through the `metastore.enabled` system/session option. diff --git a/_docs/sql-reference/sql-commands/009-analyze-table.md b/_docs/sql-reference/sql-commands/009-analyze-table-compute-statistics.md similarity index 75% rename from _docs/sql-reference/sql-commands/009-analyze-table.md rename to _docs/sql-reference/sql-commands/009-analyze-table-compute-statistics.md index 9251d0e..a642a40 100644 --- a/_docs/sql-reference/sql-commands/009-analyze-table.md +++ b/_docs/sql-reference/sql-commands/009-analyze-table-compute-statistics.md @@ -1,11 +1,11 @@ --- -title: "ANALYZE TABLE" +title: "ANALYZE TABLE COMPUTE STATISTICS" date: 2019-05-31 parent: "SQL Commands" --- -Drill 1.16 and later supports the ANALYZE TABLE statement. The ANALYZE TABLE statement computes statistics on Parquet data stored in tables and directories. The optimizer in Drill uses statistics to estimate filter, aggregation, and join cardinalities and create an optimal query plan. 
-ANALYZE TABLE writes statistics to a JSON file in the `.stats.drill` directory, for example `/user/table1/.stats.drill/0_0.json`. +Drill 1.16 and later supports the ANALYZE TABLE COMPUTE STATISTICS statement. The ANALYZE TABLE COMPUTE STATISTICS statement computes statistics on Parquet data stored in tables and directories. The optimizer in Drill uses statistics to estimate filter, aggregation, and join cardinalities and create an optimal query plan. +ANALYZE TABLE COMPUTE STATISTICS writes statistics to a JSON file in the `.stats.drill` directory, for example `/user/table1/.stats.drill/0_0.json`. Drill will not use the statistics for query planning unless you enable the `planner.statistics.use` option, as shown: @@ -15,7 +15,7 @@ Alternatively, you can enable the option in the Drill Web UI at `http://<drill-h ## Syntax -The ANALYZE TABLE statement supports the following syntax: +The ANALYZE TABLE COMPUTE STATISTICS statement supports the following syntax: ANALYZE TABLE [workspace.]table_name COMPUTE STATISTICS [(column1, column2,...)] [SAMPLE number PERCENT] @@ -42,7 +42,7 @@ An integer that specifies the percentage of data on which to compute statistics. 
## Related Command -If you drop a table that you have already run ANALYZE TABLE against, the statistics are automatically removed with the table: +If you drop a table that you have already run ANALYZE TABLE COMPUTE STATISTICS against, the statistics are automatically removed with the table: DROP TABLE [IF EXISTS] [workspace.]name @@ -50,7 +50,7 @@ To remove statistics for a table you want to keep, you must remove the directory DROP TABLE [IF EXISTS] [workspace.]name/.stats.drill -If you have already issued the ANALYZE TABLE statement against specific columns, table, or directory, you must run the DROP TABLE statement with `/.stats.drill` before you can successfully run the ANALYZE TABLE statement against the data source again, for example: +If you have already issued the ANALYZE TABLE COMPUTE STATISTICS statement against specific columns, table, or directory, you must run the DROP TABLE statement with `/.stats.drill` before you can successfully run the ANALYZE TABLE COMPUTE STATISTICS statement against the data source again, for example: DROP TABLE `table_stats/Tpch0.01/parquet/customer/.stats.drill`; @@ -60,30 +60,30 @@ Note that `/.stats.drill` is the directory to which the JSON file with statistic ## Usage Notes -- The ANALYZE TABLE statement can compute statistics for Parquet data stored in tables, columns, and directories within dfs storage plugins only. -- The user running the ANALYZE TABLE statement must have read and write permissions on the data source. +- The ANALYZE TABLE COMPUTE STATISTICS statement can compute statistics for Parquet data stored in tables, columns, and directories within dfs storage plugins only. +- The user running the ANALYZE TABLE COMPUTE STATISTICS statement must have read and write permissions on the data source. 
- The optimizer in Drill computes the following types of statistics for each column: - Rowcount (total number of entries in the table) - Nonnullrowcount (total number of non-null entries in the table) - NDV (total distinct values in the table) - Avgwidth (average width, in bytes, of a column) - Majortype (data type and data mode (OPTIONAL, REQUIRED, REPEATED) of the column values) - - Histogram (represents the frequency distribution of values (numeric data) in a column) See [Histograms]({{site.baseurl}}/docs/analyze-table/#histograms). + - Histogram (represents the frequency distribution of values (numeric data) in a column) See [Histograms]({{site.baseurl}}/docs/analyze-table-compute-statistics/#histograms). - When you look at the statistics file, statistics for each column display in the following format (c_nationkey is used as an example column): {"column":"`c_nationkey`","majortype":{"type":"INT","mode":"REQUIRED"},"schema":1.0,"rowcount":1500.0,"nonnullrowcount":1500.0,"ndv":25,"avgwidth":4.0,"histogram":{"category":"numeric-equi-depth","numRowsPerBucket":150,"buckets":[0.0,2.0,4.0,7.0,9.0,12.0,15.199999999999978,17.0,19.0,22.0,24.0]}} -- ANALYZE TABLE can compute statistics on nested scalar columns; however, you must explicitly state the columns, for example: +- ANALYZE TABLE COMPUTE STATISTICS can compute statistics on nested scalar columns; however, you must explicitly state the columns, for example: `ANALYZE TABLE employee_table COMPUTE STATISTICS (name.firstname, name.lastname);` -- ANALYZE TABLE can compute statistics at the root directory level, but not at the partition level. +- ANALYZE TABLE COMPUTE STATISTICS can compute statistics at the root directory level, but not at the partition level. Drill does not compute statistics for complex types (maps, arrays). 
## Related Options -You can set the following options related to the ANALYZE TABLE statement at the system or session level with the SET (session level) or ALTER SYSTEM SET (system level) statements, or through the Drill Web UI at `http://<drill-hostname-or-ip>:8047/options`: +You can set the following options related to the ANALYZE TABLE COMPUTE STATISTICS statement at the system or session level with the SET (session level) or ALTER SYSTEM SET (system level) statements, or through the Drill Web UI at `http://<drill-hostname-or-ip>:8047/options`: - **planner.statistics.use** -Enables the query planner to use statistics. When disabled, ANALYZE TABLE generates statistics, but the query planner will not use the statistics unless this option is enabled. Disabled (false) by default. +Enables the query planner to use statistics. When disabled, ANALYZE TABLE COMPUTE STATISTICS generates statistics, but the query planner will not use the statistics unless this option is enabled. Disabled (false) by default. - **exec.statistics.ndv_accuracy** Controls the trade-off between NDV statistic computation memory cost and accuracy. Controls the amount of memory for estimates. More memory produces more accurate estimates. The default value should suffice for most scenarios. Default is 20. Range is 0- 30. - **exec.statistics.ndv_extrapolation_bf_elements** @@ -99,7 +99,7 @@ Controls the 'compression' factor for the TDigest algorithm used for histogram s ## Reserved Keywords -The ANALYZE TABLE statement introduces the following reserved keywords: +The ANALYZE TABLE COMPUTE STATISTICS statement introduces the following reserved keywords: Analyze Compute @@ -112,11 +112,11 @@ If you use any of these words in a Drill query, you must enclose the word in bac SELECT * FROM `estimate`; -## ANALYZE TABLE Performance +## ANALYZE TABLE COMPUTE STATISTICS Performance -- After you run the ANALYZE TABLE statement, you can view the profile for ANALYZE in the Drill Web UI. 
Go to `http://<drill-hostname-or-ip>:8047/profiles`, and click the ANALYZE TABLE statement for which you want to view the profile. +- After you run the ANALYZE TABLE COMPUTE STATISTICS statement, you can view the profile for ANALYZE in the Drill Web UI. Go to `http://<drill-hostname-or-ip>:8047/profiles`, and click the ANALYZE TABLE statement for which you want to view the profile. - Should you notice any performance issues, you may want to decrease the value of the `planner.slice_target` option. -- Generating statistics on large data sets can consume time and resources, such as memory and CPU. ANALYZE TABLE can compute statistics on a sample (subset of the data indicated as a percentage) to limit the amount of resources needed for computation. Drill still scans the entire data set, but only computes on the rows selected for sampling. Rows are randomly selected for the sample. Note that the quality of statistics increases with the sample size. +- Generating statistics on large data sets can consume time and resources, such as memory and CPU. ANALYZE TABLE COMPUTE STATISTICS can compute statistics on a sample (subset of the data indicated as a percentage) to limit the amount of resources needed for computation. Drill still scans the entire data set, but only computes on the rows selected for sampling. Rows are randomly selected for the sample. Note that the quality of statistics increases with the sample size. ## Queries that Benefit from Statistics Typically, the types of queries that benefit from statistics are those that include: @@ -132,7 +132,7 @@ Typically, the types of queries that benefit from statistics are those that incl Histograms show the distribution of data to determine if data is skewed or normally distributed. Histogram statistics improve the selectivity estimates used by the optimizer to create the most efficient query plans possible. Histogram statistics are useful for range predicates to help determine how many rows belong to a particular range. 
-Running the ANALYZE TABLE statement generates equi-depth histogram statistics on each column in a table. Equi-depth histograms distribute distinct column values across buckets of varying widths, with all buckets having approximately the same number of rows. The fixed number of rows per bucket is predetermined by `ceil(number_rows/n)`, where `n` is the number of buckets. The number of distinct values in each bucket depends on the distribution of the values in a column. Equi-depth histogra [...] +Running the ANALYZE TABLE COMPUTE STATISTICS statement generates equi-depth histogram statistics on each column in a table. Equi-depth histograms distribute distinct column values across buckets of varying widths, with all buckets having approximately the same number of rows. The fixed number of rows per bucket is predetermined by `ceil(number_rows/n)`, where `n` is the number of buckets. The number of distinct values in each bucket depends on the distribution of the values in a column. [...] The following diagram shows the column values on the horizontal axis and the individual frequencies (dark blue) and total frequency of a bucket (light blue). In this example, the total number of rows = 64, hence the number of rows per bucket = `ceil(64/4) = 16`. @@ -158,19 +158,19 @@ In this example, there are 10 buckets. Each bucket contains 150 rows, which is c ## Limitations - Drill does not cache statistics. -- ANALYZE TABLE runs only on directory-based Parquet tables. -- ANALYZE TABLE cannot do the following: +- ANALYZE TABLE COMPUTE STATISTICS runs only on directory-based Parquet tables. 
+- ANALYZE TABLE COMPUTE STATISTICS cannot do the following: - compute statistics on schema-less file formats, such as text and CSV - provide up-to-date statistics for operational data due to potential mismatches that can occur between operational updates and manually running ANALYZE TABLE -- Running the ANALYZE TABLE statement against multiple files in which some of the files have null values and others have no null values may return the following generic Drill error, which is not specific to the ANALYZE command: +- Running the ANALYZE TABLE COMPUTE STATISTICS statement against multiple files in which some of the files have null values and others have no null values may return the following generic Drill error, which is not specific to the ANALYZE command: Error: SYSTEM ERROR: IllegalStateException: Failure while reading vector. Expected vector class of org.apache.drill.exec.vector.NullableBigIntVector but was holding vector class org.apache.drill.exec.vector.IntVector, field= [`o_custkey` (INT:REQUIRED)] - //If you encounter this error, run the ANALYZE TABLE statement on each file with null values individually instead of running the statement against all the files at once. + //If you encounter this error, run the ANALYZE TABLE COMPUTE STATISTICS statement on each file with null values individually instead of running the statement against all the files at once. -- Running the ANALYZE TABLE statement creates the stats file, which changes the directory timestamp. The change of the timestamp automatically triggers the REFRESH TABLE METADATA command, even when the underlying data has not changed. +- Running the ANALYZE TABLE COMPUTE STATISTICS statement creates the stats file, which changes the directory timestamp. The change of the timestamp automatically triggers the REFRESH TABLE METADATA command, even when the underlying data has not changed. 
## EXAMPLES @@ -201,7 +201,7 @@ The following query shows the columns and types of data in the “customer” ta ###Enabling Statistics for Query Planning -You can run the ANALYZE TABLE statement at any time to compute statistics; however, you must enable the following option if you want Drill to use statistics during query planning: +You can run the ANALYZE TABLE COMPUTE STATISTICS statement at any time to compute statistics; however, you must enable the following option if you want Drill to use statistics during query planning: set `planner.statistics.use`=true; +------+---------------------------------+ @@ -213,7 +213,7 @@ You can run the ANALYZE TABLE statement at any time to compute statistics; howev ###Computing Statistics You can compute statistics on directories with Parquet data or on Parquet tables. -You can run the ANALYZE TABLE statement on a subset of columns to generate statistics for those columns only, as shown: +You can run the ANALYZE TABLE COMPUTE STATISTICS statement on a subset of columns to generate statistics for those columns only, as shown: analyze table `table_stats/Tpch0.01/parquet/customer` compute statistics (c_custkey, c_nationkey, c_acctbal); +----------+---------------------------+ @@ -222,7 +222,7 @@ You can run the ANALYZE TABLE statement on a subset of columns to generate stati | 0_0 | 3 | +----------+---------------------------+ -Or, you can run the ANALYZE TABLE statement on the entire table/directory if you want statistics generated for all the columns: +Or, you can run the ANALYZE TABLE COMPUTE STATISTICS statement on the entire table/directory if you want statistics generated for all the columns: analyze table `table_stats/Tpch0.01/parquet/customer` compute statistics; +----------+---------------------------+ @@ -234,7 +234,7 @@ Or, you can run the ANALYZE TABLE statement on the entire table/directory if you ###Computing Statistics on a SAMPLE -You can also run ANALYZE TABLE on a percentage of the data using the SAMPLE command, 
as shown: +You can also run ANALYZE TABLE COMPUTE STATISTICS on a percentage of the data using the SAMPLE command, as shown: ANALYZE TABLE `table_stats/Tpch0.01/parquet/customer` COMPUTE STATISTICS SAMPLE 50 PERCENT; +----------+---------------------------+ @@ -247,9 +247,9 @@ You can also run ANALYZE TABLE on a percentage of the data using the SAMPLE comm ###Storing Statistics When you generate statistics, a statistics directory (`.stats.drill`) is created with a JSON file that contains the statistical data. -For tables, the `.stats.drill` directory is nested within the table directory. For example, if you ran ANALYZE TABLE against a table named “customer,” you could access the statistic file in `/customer/.stats.drill`. The JSON file is stored in the `.stats.drill` directory. +For tables, the `.stats.drill` directory is nested within the table directory. For example, if you ran ANALYZE TABLE COMPUTE STATISTICS against a table named “customer,” you could access the statistics file in `/customer/.stats.drill`. The JSON file is stored in the `.stats.drill` directory. -For directories, a new directory is written with the same name as the directory on which you ran ANALYZE TABLE, appended by `.stats.drill`. For example, if you ran ANALYZE TABLE against a directory named “customer,” you could access the JSON statistics file in the new `customer.stats.drill` directory. +For directories, a new directory is written with the same name as the directory on which you ran ANALYZE TABLE COMPUTE STATISTICS, appended by `.stats.drill`. For example, if you ran ANALYZE TABLE COMPUTE STATISTICS against a directory named “customer,” you could access the JSON statistics file in the new `customer.stats.drill` directory. 
You can query the statistics file to see the statistics generated for each column, as shown in the following two examples: @@ -272,7 +272,7 @@ You can query the statistics file to see the statistics generated for each colum ###Dropping Statistics -If you want to compute statistics on a table or directory that you have already run the ANALYZE TABLE statement against, you must first drop the statistics before you can run ANALYZE TABLE statement on the table again. +If you want to compute statistics on a table or directory that you have already run the ANALYZE TABLE COMPUTE STATISTICS statement against, you must first drop the statistics before you can run the ANALYZE TABLE COMPUTE STATISTICS statement on the table again. The following example demonstrates how to drop statistics on a table: @@ -303,9 +303,9 @@ When you drop statistics, the statistics directory no longer exists for the tabl ## Troubleshooting -Typical errors you may get when running ANALYZE TABLE result from running the statement against an individual file or against a data source other than Parquet, as shown in the following examples: +Typical errors you may get when running ANALYZE TABLE COMPUTE STATISTICS result from running the statement against an individual file or against a data source other than Parquet, as shown in the following examples: -**Running ANALYZE TABLE on a file.** +**Running ANALYZE TABLE COMPUTE STATISTICS on a file.** ANALYZE TABLE `/parquet/nation.parquet` COMPUTE STATISTICS; +--------+----------------------------------------------------------------------------------+ @@ -315,7 +315,7 @@ Typical errors you may get when running ANALYZE TABLE result from running the st +--------+----------------------------------------------------------------------------------+ -**Running ANALYZE TABLE on a data source other than Parquet.** +**Running ANALYZE TABLE COMPUTE STATISTICS on a data source other than Parquet.** ANALYZE TABLE nation1_json COMPUTE STATISTICS; 
+--------+----------------------------------------------------------------------------------+ @@ -323,18 +323,3 @@ Typical errors you may get when running ANALYZE TABLE result from running the st +--------+----------------------------------------------------------------------------------+ | false | Table nation1_json is not supported by ANALYZE. Support is currently limited to directory-based Parquet tables. | +--------+----------------------------------------------------------------------------------+ - - - - - - - - - - - - - - -