This is an automated email from the ASF dual-hosted git repository. bridgetb pushed a commit to branch gh-pages in repository https://gitbox.apache.org/repos/asf/drill.git
commit fd42f80df2fab5a1a23a88ca96e3d59026091584 Author: Volodymyr Vysotskyi <vvo...@gmail.com> AuthorDate: Mon Jan 13 16:43:10 2020 +0200 Add docs for Drill Metastore --- _data/docs.json | 156 +++++++++++++++----- _docs/getting-started/010-drill-introduction.md | 4 +- _docs/rn/005-1.16.0-rn.md | 4 +- .../007-analyze-table-refresh-metadata.md | 158 +++++++++++++++++++++ ....md => 009-analyze-table-compute-statistics.md} | 81 +++++------ 5 files changed, 315 insertions(+), 88 deletions(-) diff --git a/_data/docs.json b/_data/docs.json index b8f927f..f74ef35 100644 --- a/_data/docs.json +++ b/_data/docs.json @@ -21,7 +21,7 @@ "title": "ALTER SYSTEM", "url": "/docs/alter-system/" }, - "ANALYZE TABLE": { + "ANALYZE TABLE COMPUTE STATISTICS": { "breadcrumbs": [ { "title": "SQL Commands", @@ -36,11 +36,32 @@ "next_title": "SET", "next_url": "/docs/set/", "parent": "SQL Commands", + "previous_title": "ANALYZE TABLE REFRESH METADATA", + "previous_url": "/docs/analyze-table-refresh-metadata/", + "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table-compute-statistics.md", + "title": "ANALYZE TABLE COMPUTE STATISTICS", + "url": "/docs/analyze-table-compute-statistics/" + }, + "ANALYZE TABLE REFRESH METADATA": { + "breadcrumbs": [ + { + "title": "SQL Commands", + "url": "/docs/sql-commands/" + }, + { + "title": "SQL Reference", + "url": "/docs/sql-reference/" + } + ], + "children": [], + "next_title": "ANALYZE TABLE COMPUTE STATISTICS", + "next_url": "/docs/analyze-table-compute-statistics/", + "parent": "SQL Commands", "previous_title": "Supported SQL Commands", "previous_url": "/docs/supported-sql-commands/", - "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table.md", - "title": "ANALYZE TABLE", - "url": "/docs/analyze-table/" + "relative_path": "_docs/sql-reference/sql-commands/007-analyze-table-refresh-metadata.md", + "title": "ANALYZE TABLE REFRESH METADATA", + "url": "/docs/analyze-table-refresh-metadata/" }, "About SQL Function Examples": 
{ "breadcrumbs": [ @@ -11315,8 +11336,8 @@ "next_title": "REFRESH TABLE METADATA", "next_url": "/docs/refresh-table-metadata/", "parent": "SQL Commands", - "previous_title": "ANALYZE TABLE", - "previous_url": "/docs/analyze-table/", + "previous_title": "ANALYZE TABLE COMPUTE STATISTICS", + "previous_url": "/docs/analyze-table-compute-statistics/", "relative_path": "_docs/sql-reference/sql-commands/010-set.md", "title": "SET", "url": "/docs/set/" @@ -11404,8 +11425,8 @@ } ], "children": [], - "next_title": "ANALYZE TABLE", - "next_url": "/docs/analyze-table/", + "next_title": "ANALYZE TABLE REFRESH METADATA", + "next_url": "/docs/analyze-table-refresh-metadata/", "parent": "SQL Commands", "previous_title": "SQL Commands", "previous_url": "/docs/sql-commands/", @@ -11425,14 +11446,35 @@ } ], "children": [], - "next_title": "SET", - "next_url": "/docs/set/", + "next_title": "ANALYZE TABLE COMPUTE STATISTICS", + "next_url": "/docs/analyze-table-compute-statistics/", "parent": "SQL Commands", "previous_title": "Supported SQL Commands", "previous_url": "/docs/supported-sql-commands/", - "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table.md", - "title": "ANALYZE TABLE", - "url": "/docs/analyze-table/" + "relative_path": "_docs/sql-reference/sql-commands/007-analyze-table-refresh-metadata.md", + "title": "ANALYZE TABLE REFRESH METADATA", + "url": "/docs/analyze-table-refresh-metadata/" + }, + { + "breadcrumbs": [ + { + "title": "SQL Commands", + "url": "/docs/sql-commands/" + }, + { + "title": "SQL Reference", + "url": "/docs/sql-reference/" + } + ], + "children": [], + "next_title": "SET", + "next_url": "/docs/set/", + "parent": "SQL Commands", + "previous_title": "ANALYZE TABLE REFRESH METADATA", + "previous_url": "/docs/analyze-table-refresh-metadata/", + "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table-compute-statistics.md", + "title": "ANALYZE TABLE COMPUTE STATISTICS", + "url": "/docs/analyze-table-compute-statistics/" }, { 
"breadcrumbs": [ @@ -11449,8 +11491,8 @@ "next_title": "REFRESH TABLE METADATA", "next_url": "/docs/refresh-table-metadata/", "parent": "SQL Commands", - "previous_title": "ANALYZE TABLE", - "previous_url": "/docs/analyze-table/", + "previous_title": "ANALYZE TABLE COMPUTE STATISTICS", + "previous_url": "/docs/analyze-table-compute-statistics/", "relative_path": "_docs/sql-reference/sql-commands/010-set.md", "title": "SET", "url": "/docs/set/" @@ -13067,8 +13109,8 @@ } ], "children": [], - "next_title": "ANALYZE TABLE", - "next_url": "/docs/analyze-table/", + "next_title": "ANALYZE TABLE REFRESH METADATA", + "next_url": "/docs/analyze-table-refresh-metadata/", "parent": "SQL Commands", "previous_title": "SQL Commands", "previous_url": "/docs/sql-commands/", @@ -13088,14 +13130,35 @@ } ], "children": [], - "next_title": "SET", - "next_url": "/docs/set/", + "next_title": "ANALYZE TABLE COMPUTE STATISTICS", + "next_url": "/docs/analyze-table-compute-statistics/", "parent": "SQL Commands", "previous_title": "Supported SQL Commands", "previous_url": "/docs/supported-sql-commands/", - "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table.md", - "title": "ANALYZE TABLE", - "url": "/docs/analyze-table/" + "relative_path": "_docs/sql-reference/sql-commands/007-analyze-table-refresh-metadata.md", + "title": "ANALYZE TABLE REFRESH METADATA", + "url": "/docs/analyze-table-refresh-metadata/" + }, + { + "breadcrumbs": [ + { + "title": "SQL Commands", + "url": "/docs/sql-commands/" + }, + { + "title": "SQL Reference", + "url": "/docs/sql-reference/" + } + ], + "children": [], + "next_title": "SET", + "next_url": "/docs/set/", + "parent": "SQL Commands", + "previous_title": "ANALYZE TABLE REFRESH METADATA", + "previous_url": "/docs/analyze-table-refresh-metadata/", + "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table-compute-statistics.md", + "title": "ANALYZE TABLE COMPUTE STATISTICS", + "url": "/docs/analyze-table-compute-statistics/" }, { 
"breadcrumbs": [ @@ -13112,8 +13175,8 @@ "next_title": "REFRESH TABLE METADATA", "next_url": "/docs/refresh-table-metadata/", "parent": "SQL Commands", - "previous_title": "ANALYZE TABLE", - "previous_url": "/docs/analyze-table/", + "previous_title": "ANALYZE TABLE COMPUTE STATISTICS", + "previous_url": "/docs/analyze-table-compute-statistics/", "relative_path": "_docs/sql-reference/sql-commands/010-set.md", "title": "SET", "url": "/docs/set/" @@ -14900,8 +14963,8 @@ } ], "children": [], - "next_title": "ANALYZE TABLE", - "next_url": "/docs/analyze-table/", + "next_title": "ANALYZE TABLE REFRESH METADATA", + "next_url": "/docs/analyze-table-refresh-metadata/", "parent": "SQL Commands", "previous_title": "SQL Commands", "previous_url": "/docs/sql-commands/", @@ -20703,8 +20766,8 @@ } ], "children": [], - "next_title": "ANALYZE TABLE", - "next_url": "/docs/analyze-table/", + "next_title": "ANALYZE TABLE REFRESH METADATA", + "next_url": "/docs/analyze-table-refresh-metadata/", "parent": "SQL Commands", "previous_title": "SQL Commands", "previous_url": "/docs/sql-commands/", @@ -20724,14 +20787,35 @@ } ], "children": [], - "next_title": "SET", - "next_url": "/docs/set/", + "next_title": "ANALYZE TABLE COMPUTE STATISTICS", + "next_url": "/docs/analyze-table-compute-statistics/", "parent": "SQL Commands", "previous_title": "Supported SQL Commands", "previous_url": "/docs/supported-sql-commands/", - "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table.md", - "title": "ANALYZE TABLE", - "url": "/docs/analyze-table/" + "relative_path": "_docs/sql-reference/sql-commands/007-analyze-table-refresh-metadata.md", + "title": "ANALYZE TABLE REFRESH METADATA", + "url": "/docs/analyze-table-refresh-metadata/" + }, + { + "breadcrumbs": [ + { + "title": "SQL Commands", + "url": "/docs/sql-commands/" + }, + { + "title": "SQL Reference", + "url": "/docs/sql-reference/" + } + ], + "children": [], + "next_title": "SET", + "next_url": "/docs/set/", + "parent": "SQL 
Commands", + "previous_title": "ANALYZE TABLE REFRESH METADATA", + "previous_url": "/docs/analyze-table-refresh-metadata/", + "relative_path": "_docs/sql-reference/sql-commands/009-analyze-table-compute-statistics.md", + "title": "ANALYZE TABLE COMPUTE STATISTICS", + "url": "/docs/analyze-table-compute-statistics/" }, { "breadcrumbs": [ @@ -20748,8 +20832,8 @@ "next_title": "REFRESH TABLE METADATA", "next_url": "/docs/refresh-table-metadata/", "parent": "SQL Commands", - "previous_title": "ANALYZE TABLE", - "previous_url": "/docs/analyze-table/", + "previous_title": "ANALYZE TABLE COMPUTE STATISTICS", + "previous_url": "/docs/analyze-table-compute-statistics/", "relative_path": "_docs/sql-reference/sql-commands/010-set.md", "title": "SET", "url": "/docs/set/" @@ -22720,4 +22804,4 @@ "url": "/docs/ecosystem/" } ] -} \ No newline at end of file +} diff --git a/_docs/getting-started/010-drill-introduction.md b/_docs/getting-started/010-drill-introduction.md index 7035a98..5ab61aa 100644 --- a/_docs/getting-started/010-drill-introduction.md +++ b/_docs/getting-started/010-drill-introduction.md @@ -1,6 +1,6 @@ --- title: "Drill Introduction" -date: 2019-12-26 +date: 2020-01-13 parent: "Getting Started" --- Drill is an Apache open-source SQL query engine for Big Data exploration. 
@@ -46,7 +46,7 @@ Metastore support: ## What's New in Apache Drill 1.16 -- [ANALYZE TABLE statement]({{site.baseurl}}/docs/analyze-table/) to computes statistics on Parquet data ([DRILL-1328](https://issues.apache.org/jira/browse/DRILL-1328)) +- [ANALYZE TABLE statement]({{site.baseurl}}/docs/analyze-table-compute-statistics/) to computes statistics on Parquet data ([DRILL-1328](https://issues.apache.org/jira/browse/DRILL-1328)) - [CREATE OR REPLACE SCHEMA command]({{site.baseurl}}/docs/create-or-replace-schema/) to define a schema for text files ([DRILL-6964](https://issues.apache.org/jira/browse/DRILL-6964)) - [REFRESH TABLE METADATA command]({{site.baseurl}}/docs/refresh-table-metadata/) can generate metadata cache files for specific columns ([DRILL-7058](https://issues.apache.org/jira/browse/DRILL-7058)) - [SYSLOG (RFC-5424) Format Plugin]({{site.baseurl}}/docs/syslog-format-plugin/) ([DRILL-6582](https://issues.apache.org/jira/browse/DRILL-6582)) diff --git a/_docs/rn/005-1.16.0-rn.md b/_docs/rn/005-1.16.0-rn.md index ac07ab7..88fabd7 100644 --- a/_docs/rn/005-1.16.0-rn.md +++ b/_docs/rn/005-1.16.0-rn.md @@ -12,7 +12,7 @@ Today, we're happy to announce the availability of Drill 1.16.0. 
You can downloa This release of Drill provides the following new features and improvements: -- [ANALYZE TABLE statement]({{site.baseurl}}/docs/analyze-table/) to computes statistics on Parquet data ([DRILL-1328](https://issues.apache.org/jira/browse/DRILL-1328)) +- [ANALYZE TABLE statement]({{site.baseurl}}/docs/analyze-table-compute-statistics/) to computes statistics on Parquet data ([DRILL-1328](https://issues.apache.org/jira/browse/DRILL-1328)) - [CREATE OR REPLACE SCHEMA command]({{site.baseurl}}/docs/create-or-replace-schema/) to define a schema for text files ([DRILL-6964](https://issues.apache.org/jira/browse/DRILL-6964)) - [REFRESH TABLE METADATA command]({{site.baseurl}}/docs/refresh-table-metadata/) can generate metadata cache files for specific columns ([DRILL-7058](https://issues.apache.org/jira/browse/DRILL-7058)) - [SYSLOG (RFC-5424) Format Plugin]({{site.baseurl}}/docs/syslog-format-plugin/) ([DRILL-6582](https://issues.apache.org/jira/browse/DRILL-6582)) @@ -498,4 +498,4 @@ The following sections provide a complete list of all the fixes and improvements </li> </ul> - \ No newline at end of file + diff --git a/_docs/sql-reference/sql-commands/007-analyze-table-refresh-metadata.md b/_docs/sql-reference/sql-commands/007-analyze-table-refresh-metadata.md new file mode 100644 index 0000000..21fa711 --- /dev/null +++ b/_docs/sql-reference/sql-commands/007-analyze-table-refresh-metadata.md @@ -0,0 +1,158 @@ +--- +title: "ANALYZE TABLE REFRESH METADATA" +parent: "SQL Commands" +date: 2020-01-13 +--- + +Starting from Drill 1.17, you can store table metadata (including schema and computed statistics) into Drill Metastore. +This metadata will be used when querying a table for more optimal plan creation. 
+ +{% include startnote.html %}In Drill 1.17, this feature is supported for Parquet tables only and is disabled by default.{% include endnote.html %} + +To enable Drill Metastore usage, the following option `metastore.enabled` should be set to `true`, as shown: + + SET `metastore.enabled` = true; + +Alternatively, you can enable the option in the Drill Web UI at `http://<drill-hostname-or-ip-address>:8047/options`. + +## Syntax + +The ANALYZE TABLE REFRESH METADATA statement supports the following syntax: + + ANALYZE TABLE [table_name] [COLUMNS {(col1, col2, ...) | NONE}] + REFRESH METADATA ['level' LEVEL] + [{COMPUTE | ESTIMATE} | STATISTICS [(column1, column2, ...)] + [ SAMPLE number PERCENT ]] + +## Parameters + +*table_name* +The name of the table or directory for which Drill will collect table metadata. If the table does not exist, or the table + is temporary, the command fails and metadata is not collected and stored. + +*COLUMNS (col1, col2, ...)* +Optional names of the column(s) for which Drill will generate and store metadata. The stored schema will include all table columns. + +*COLUMNS NONE* +Specifies to ignore collecting and storing metadata for all table columns. + +*level* +Optional varchar literal which specifies maximum level depth for collecting metadata. +Possible values: `TABLE`, `SEGMENT`, `PARTITION`, `FILE`, `ROW_GROUP`, `ALL`. Default is `ALL`. + +*COMPUTE* +Generates statistics for the table to be stored into the Metastore. +If statistics usage is disabled (`planner.enable_statistics` is set to `false`), an error will be thrown when this clause is specified. + +*ESTIMATE* +Generates estimated statistics for the table to be stored into the Metastore. Currently it is not supported. + +*(column1, column2, ...)* +The name of the column(s) for which Drill will generate statistics. + +*SAMPLE* +Optional. Indicates that compute statistics should run on a subset of the data.
+ +*number PERCENT* +An integer that specifies the percentage of data on which to compute statistics. For example, if a table has 100 rows, `SAMPLE 50 PERCENT` indicates that statistics should be computed on 50 rows. The optimizer selects the rows at random. + +## Related Options + +- **metastore.enabled** +Enables Drill Metastore usage to be able to store table metadata during ANALYZE TABLE commands execution and to be able + to read table metadata during regular queries execution or when querying some INFORMATION_SCHEMA tables. Default is `false`. +- **metastore.metadata.store.depth_level** +Specifies maximum level depth for collecting metadata. Default is `'ALL'`. +- **metastore.retrieval.retry_attempts** +Specifies the number of attempts for retrying query planning after detecting that query metadata is changed. +If the number of retries was exceeded, query will be planned without metadata information from the Metastore. Default is 5. +- **metastore.metadata.fallback_to_file_metadata** +Allows using file metadata cache for the case when required metadata is absent in the Metastore. Default is true. +- **metastore.metadata.use_schema** +Enables schema usage, stored to the Metastore. Default is `true`. +- **metastore.metadata.use_statistics** +Enables statistics usage, stored in the Metastore, at the planning stage. Default is `true`. +- **metastore.metadata.ctas.auto-collect** +Specifies whether schema and / or column statistics will be automatically collected for every table after CTAS and CTTAS. +This option is not active for now. Default is `'NONE'`. +- **drill.exec.storage.implicit.last_modified_time.column.label** +Sets the implicit column name for the last modified time (`lmt`) column. For internal usage when producing Metastore analyze. +- **drill.exec.storage.implicit.row_group_index.column.label** +Sets the implicit column name for the row group index (`rgi`) column. For internal usage when producing Metastore analyze. 
+- **drill.exec.storage.implicit.row_group_length.column.label** +Sets the implicit column name for the row group length (`rgl`) column. For internal usage when producing Metastore analyze. +- **drill.exec.storage.implicit.row_group_start.column.label** +Sets the implicit column name for the row group start (`rgs`) column. For internal usage when producing Metastore analyze. + +## Related Commands + +To drop table metadata from the Metastore, the following command may be used: + + ANALYZE TABLE [table_name] DROP [METADATA|STATISTICS] [IF EXISTS] + +It will not throw an exception for absent table metadata if `IF EXISTS` clause was specified. + + ANALYZE TABLE [workspace.]table_name COMPUTE STATISTICS [(column1, column2,...)] [SAMPLE number PERCENT] + +See [ANALYZE TABLE COMPUTE STATISTICS]({{site.baseurl}}/docs/analyze-table-compute-statistics). + +## Usage Notes + +### General Information + +- Currently `ANALYZE TABLE REFRESH METADATA` statement can compute and store metadata only for Parquet tables within `dfs` storage plugins. +- For the case when `ANALYZE TABLE REFRESH METADATA` command is executed for the first time, whole table metadata will be collected and stored into Metastore. +If analyze was already executed for the table, and table data wasn't changed, all further analyze commands wouldn't trigger table analyzing and message that table metadata is up to date will be returned. + +### Incremental analyze + +For the case when some table data was updated, Drill will try to execute incremental analyze - calculate metadata only for updated data and reuse required metadata from the Metastore. + +Incremental analyze wouldn't be produced for the following cases: +- list of interesting columns specified in analyze is not a subset of interesting columns from the previous analyze; +- specified metadata level differs from the metadata level in previous analyze. 
+ +### Metadata usage + +Drill provides the ability to use metadata obtained from the Metastore at the planning stage to prune segments, files + and row groups. + +Tables metadata from the Metastore is exposed to `INFORMATION_SCHEMA` tables (if Metastore usage is enabled). + +The following tables are populated with table metadata from the Metastore: + +`TABLES` table has the following additional columns populated from the Metastore: + +- `TABLE_SOURCE` - table data type: `PARQUET`, `CSV`, `JSON` +- `LOCATION` - table location: `/tmp/nation` +- `NUM_ROWS` - number of rows in a table if known, `null` if not known +- `LAST_MODIFIED_TIME` - table's last modification time + +`COLUMNS` table has the following additional columns populated from the Metastore: + +- `COLUMN_DEFAULT` - column default value +- `COLUMN_FORMAT` - usually applicable for date time columns: `yyyy-MM-dd` +- `NUM_NULLS` - number of nulls in column values +- `MIN_VAL` - column min value in String representation: `aaa` +- `MAX_VAL` - column max value in String representation: `zzz` +- `NDV` - number of distinct values in column, expressed in Double +- `EST_NUM_NON_NULLS` - estimated number of non null values, expressed in Double +- `IS_NESTED` - if column is nested. Nested columns are extracted from columns with struct type. 
+ +`PARTITIONS` table has the following additional columns populated from the Metastore: + +- `TABLE_CATALOG` - table catalog (currently we have only one catalog): `DRILL` +- `TABLE_SCHEMA` - table schema: `dfs.tmp` +- `TABLE_NAME` - table name: `nation` +- `METADATA_KEY` - top level segment key, the same for all nested segments and partitions: `part_int=3` +- `METADATA_TYPE` - `SEGMENT` or `PARTITION` +- `METADATA_IDENTIFIER` - current metadata identifier: `part_int=3/part_varchar=g` +- `PARTITION_COLUMN` - partition column name: `part_varchar` +- `PARTITION_VALUE` - partition column value: `g` +- `LOCATION` - segment location, `null` for partitions: `/tmp/nation/part_int=3` +- `LAST_MODIFIED_TIME` - last modification time + +## Limitations + +This feature is currently in the alpha phase (preview, experimental) for Drill 1.17 and only applies to Parquet + tables in this release. You must enable this feature through the `metastore.enabled` system/session option. diff --git a/_docs/sql-reference/sql-commands/009-analyze-table.md b/_docs/sql-reference/sql-commands/009-analyze-table-compute-statistics.md similarity index 75% rename from _docs/sql-reference/sql-commands/009-analyze-table.md rename to _docs/sql-reference/sql-commands/009-analyze-table-compute-statistics.md index 9251d0e..a642a40 100644 --- a/_docs/sql-reference/sql-commands/009-analyze-table.md +++ b/_docs/sql-reference/sql-commands/009-analyze-table-compute-statistics.md @@ -1,11 +1,11 @@ --- -title: "ANALYZE TABLE" +title: "ANALYZE TABLE COMPUTE STATISTICS" date: 2019-05-31 parent: "SQL Commands" --- -Drill 1.16 and later supports the ANALYZE TABLE statement. The ANALYZE TABLE statement computes statistics on Parquet data stored in tables and directories. The optimizer in Drill uses statistics to estimate filter, aggregation, and join cardinalities and create an optimal query plan. 
-ANALYZE TABLE writes statistics to a JSON file in the `.stats.drill` directory, for example `/user/table1/.stats.drill/0_0.json`. +Drill 1.16 and later supports the ANALYZE TABLE COMPUTE STATISTICS statement. The ANALYZE TABLE COMPUTE STATISTICS statement computes statistics on Parquet data stored in tables and directories. The optimizer in Drill uses statistics to estimate filter, aggregation, and join cardinalities and create an optimal query plan. +ANALYZE TABLE COMPUTE STATISTICS writes statistics to a JSON file in the `.stats.drill` directory, for example `/user/table1/.stats.drill/0_0.json`. Drill will not use the statistics for query planning unless you enable the `planner.statistics.use` option, as shown: @@ -15,7 +15,7 @@ Alternatively, you can enable the option in the Drill Web UI at `http://<drill-h ## Syntax -The ANALYZE TABLE statement supports the following syntax: +The ANALYZE TABLE COMPUTE STATISTICS statement supports the following syntax: ANALYZE TABLE [workspace.]table_name COMPUTE STATISTICS [(column1, column2,...)] [SAMPLE number PERCENT] @@ -42,7 +42,7 @@ An integer that specifies the percentage of data on which to compute statistics. 
## Related Command -If you drop a table that you have already run ANALYZE TABLE against, the statistics are automatically removed with the table: +If you drop a table that you have already run ANALYZE TABLE COMPUTE STATISTICS against, the statistics are automatically removed with the table: DROP TABLE [IF EXISTS] [workspace.]name @@ -50,7 +50,7 @@ To remove statistics for a table you want to keep, you must remove the directory DROP TABLE [IF EXISTS] [workspace.]name/.stats.drill -If you have already issued the ANALYZE TABLE statement against specific columns, table, or directory, you must run the DROP TABLE statement with `/.stats.drill` before you can successfully run the ANALYZE TABLE statement against the data source again, for example: +If you have already issued the ANALYZE TABLE COMPUTE STATISTICS statement against specific columns, table, or directory, you must run the DROP TABLE statement with `/.stats.drill` before you can successfully run the ANALYZE TABLE COMPUTE STATISTICS statement against the data source again, for example: DROP TABLE `table_stats/Tpch0.01/parquet/customer/.stats.drill`; @@ -60,30 +60,30 @@ Note that `/.stats.drill` is the directory to which the JSON file with statistic ## Usage Notes -- The ANALYZE TABLE statement can compute statistics for Parquet data stored in tables, columns, and directories within dfs storage plugins only. -- The user running the ANALYZE TABLE statement must have read and write permissions on the data source. +- The ANALYZE TABLE COMPUTE STATISTICS statement can compute statistics for Parquet data stored in tables, columns, and directories within dfs storage plugins only. +- The user running the ANALYZE TABLE COMPUTE STATISTICS statement must have read and write permissions on the data source. 
- The optimizer in Drill computes the following types of statistics for each column: - Rowcount (total number of entries in the table) - Nonnullrowcount (total number of non-null entries in the table) - NDV (total distinct values in the table) - Avgwidth (average width, in bytes, of a column) - Majortype (data type and data mode (OPTIONAL, REQUIRED, REPEATED) of the column values) - - Histogram (represents the frequency distribution of values (numeric data) in a column) See [Histograms]({{site.baseurl}}/docs/analyze-table/#histograms). + - Histogram (represents the frequency distribution of values (numeric data) in a column) See [Histograms]({{site.baseurl}}/docs/analyze-table-compute-statistics/#histograms). - When you look at the statistics file, statistics for each column display in the following format (c_nationkey is used as an example column): {"column":"`c_nationkey`","majortype":{"type":"INT","mode":"REQUIRED"},"schema":1.0,"rowcount":1500.0,"nonnullrowcount":1500.0,"ndv":25,"avgwidth":4.0,"histogram":{"category":"numeric-equi-depth","numRowsPerBucket":150,"buckets":[0.0,2.0,4.0,7.0,9.0,12.0,15.199999999999978,17.0,19.0,22.0,24.0]}} -- ANALYZE TABLE can compute statistics on nested scalar columns; however, you must explicitly state the columns, for example: +- ANALYZE TABLE COMPUTE STATISTICS can compute statistics on nested scalar columns; however, you must explicitly state the columns, for example: `ANALYZE TABLE employee_table COMPUTE STATISTICS (name.firstname, name.lastname);` -- ANALYZE TABLE can compute statistics at the root directory level, but not at the partition level. +- ANALYZE TABLE COMPUTE STATISTICS can compute statistics at the root directory level, but not at the partition level. Drill does not compute statistics for complex types (maps, arrays). 
## Related Options -You can set the following options related to the ANALYZE TABLE statement at the system or session level with the SET (session level) or ALTER SYSTEM SET (system level) statements, or through the Drill Web UI at `http://<drill-hostname-or-ip>:8047/options`: +You can set the following options related to the ANALYZE TABLE COMPUTE STATISTICS statement at the system or session level with the SET (session level) or ALTER SYSTEM SET (system level) statements, or through the Drill Web UI at `http://<drill-hostname-or-ip>:8047/options`: - **planner.statistics.use** -Enables the query planner to use statistics. When disabled, ANALYZE TABLE generates statistics, but the query planner will not use the statistics unless this option is enabled. Disabled (false) by default. +Enables the query planner to use statistics. When disabled, ANALYZE TABLE COMPUTE STATISTICS generates statistics, but the query planner will not use the statistics unless this option is enabled. Disabled (false) by default. - **exec.statistics.ndv_accuracy** Controls the trade-off between NDV statistic computation memory cost and accuracy. Controls the amount of memory for estimates. More memory produces more accurate estimates. The default value should suffice for most scenarios. Default is 20. Range is 0- 30. - **exec.statistics.ndv_extrapolation_bf_elements** @@ -99,7 +99,7 @@ Controls the 'compression' factor for the TDigest algorithm used for histogram s ## Reserved Keywords -The ANALYZE TABLE statement introduces the following reserved keywords: +The ANALYZE TABLE COMPUTE STATISTICS statement introduces the following reserved keywords: Analyze Compute @@ -112,11 +112,11 @@ If you use any of these words in a Drill query, you must enclose the word in bac SELECT * FROM `estimate`; -## ANALYZE TABLE Performance +## ANALYZE TABLE COMPUTE STATISTICS Performance -- After you run the ANALYZE TABLE statement, you can view the profile for ANALYZE in the Drill Web UI. 
Go to `http://<drill-hostname-or-ip>:8047/profiles`, and click the ANALYZE TABLE statement for which you want to view the profile. +- After you run the ANALYZE TABLE COMPUTE STATISTICS statement, you can view the profile for ANALYZE in the Drill Web UI. Go to `http://<drill-hostname-or-ip>:8047/profiles`, and click the ANALYZE TABLE statement for which you want to view the profile. - Should you notice any performance issues, you may want to decrease the value of the `planner.slice_target` option. -- Generating statistics on large data sets can consume time and resources, such as memory and CPU. ANALYZE TABLE can compute statistics on a sample (subset of the data indicated as a percentage) to limit the amount of resources needed for computation. Drill still scans the entire data set, but only computes on the rows selected for sampling. Rows are randomly selected for the sample. Note that the quality of statistics increases with the sample size. +- Generating statistics on large data sets can consume time and resources, such as memory and CPU. ANALYZE TABLE COMPUTE STATISTICS can compute statistics on a sample (subset of the data indicated as a percentage) to limit the amount of resources needed for computation. Drill still scans the entire data set, but only computes on the rows selected for sampling. Rows are randomly selected for the sample. Note that the quality of statistics increases with the sample size. ## Queries that Benefit from Statistics Typically, the types of queries that benefit from statistics are those that include: @@ -132,7 +132,7 @@ Typically, the types of queries that benefit from statistics are those that incl Histograms show the distribution of data to determine if data is skewed or normally distributed. Histogram statistics improve the selectivity estimates used by the optimizer to create the most efficient query plans possible. Histogram statistics are useful for range predicates to help determine how many rows belong to a particular range. 
-Running the ANALYZE TABLE statement generates equi-depth histogram statistics on each column in a table. Equi-depth histograms distribute distinct column values across buckets of varying widths, with all buckets having approximately the same number of rows. The fixed number of rows per bucket is predetermined by `ceil(number_rows/n)`, where `n` is the number of buckets. The number of distinct values in each bucket depends on the distribution of the values in a column. Equi-depth histogra [...] +Running the ANALYZE TABLE COMPUTE STATISTICS statement generates equi-depth histogram statistics on each column in a table. Equi-depth histograms distribute distinct column values across buckets of varying widths, with all buckets having approximately the same number of rows. The fixed number of rows per bucket is predetermined by `ceil(number_rows/n)`, where `n` is the number of buckets. The number of distinct values in each bucket depends on the distribution of the values in a column. [...] The following diagram shows the column values on the horizontal axis and the individual frequencies (dark blue) and total frequency of a bucket (light blue). In this example, the total number of rows = 64, hence the number of rows per bucket = `ceil(64/4) = 16`. @@ -158,19 +158,19 @@ In this example, there are 10 buckets. Each bucket contains 150 rows, which is c ## Limitations - Drill does not cache statistics. -- ANALYZE TABLE runs only on directory-based Parquet tables. -- ANALYZE TABLE cannot do the following: +- ANALYZE TABLE COMPUTE STATISTICS runs only on directory-based Parquet tables. 
+- ANALYZE TABLE COMPUTE STATISTICS cannot do the following: - compute statistics on schema-less file formats, such as text and CSV - provide up-to-date statistics for operational data due to potential mismatches that can occur between operational updates and manually running ANALYZE TABLE -- Running the ANALYZE TABLE statement against multiple files in which some of the files have null values and others have no null values may return the following generic Drill error, which is not specific to the ANALYZE command: +- Running the ANALYZE TABLE COMPUTE STATISTICS statement against multiple files in which some of the files have null values and others have no null values may return the following generic Drill error, which is not specific to the ANALYZE command: Error: SYSTEM ERROR: IllegalStateException: Failure while reading vector. Expected vector class of org.apache.drill.exec.vector.NullableBigIntVector but was holding vector class org.apache.drill.exec.vector.IntVector, field= [`o_custkey` (INT:REQUIRED)] - //If you encounter this error, run the ANALYZE TABLE statement on each file with null values individually instead of running the statement against all the files at once. + //If you encounter this error, run the ANALYZE TABLE COMPUTE STATISTICS statement on each file with null values individually instead of running the statement against all the files at once. -- Running the ANALYZE TABLE statement creates the stats file, which changes the directory timestamp. The change of the timestamp automatically triggers the REFRESH TABLE METADATA command, even when the underlying data has not changed. +- Running the ANALYZE TABLE COMPUTE STATISTICS statement creates the stats file, which changes the directory timestamp. The change of the timestamp automatically triggers the REFRESH TABLE METADATA command, even when the underlying data has not changed. 
## EXAMPLES @@ -201,7 +201,7 @@ The following query shows the columns and types of data in the “customer” ta ###Enabling Statistics for Query Planning -You can run the ANALYZE TABLE statement at any time to compute statistics; however, you must enable the following option if you want Drill to use statistics during query planning: +You can run the ANALYZE TABLE COMPUTE STATISTICS statement at any time to compute statistics; however, you must enable the following option if you want Drill to use statistics during query planning: set `planner.statistics.use`=true; +------+---------------------------------+ @@ -213,7 +213,7 @@ You can run the ANALYZE TABLE statement at any time to compute statistics; howev ###Computing Statistics You can compute statistics on directories with Parquet data or on Parquet tables. -You can run the ANALYZE TABLE statement on a subset of columns to generate statistics for those columns only, as shown: +You can run the ANALYZE TABLE COMPUTE STATISTICS statement on a subset of columns to generate statistics for those columns only, as shown: analyze table `table_stats/Tpch0.01/parquet/customer` compute statistics (c_custkey, c_nationkey, c_acctbal); +----------+---------------------------+ @@ -222,7 +222,7 @@ You can run the ANALYZE TABLE statement on a subset of columns to generate stati | 0_0 | 3 | +----------+---------------------------+ -Or, you can run the ANALYZE TABLE statement on the entire table/directory if you want statistics generated for all the columns: +Or, you can run the ANALYZE TABLE COMPUTE STATISTICS statement on the entire table/directory if you want statistics generated for all the columns: analyze table `table_stats/Tpch0.01/parquet/customer` compute statistics; +----------+---------------------------+ @@ -234,7 +234,7 @@ Or, you can run the ANALYZE TABLE statement on the entire table/directory if you ###Computing Statistics on a SAMPLE -You can also run ANALYZE TABLE on a percentage of the data using the SAMPLE command, 
as shown: +You can also run ANALYZE TABLE COMPUTE STATISTICS on a percentage of the data using the SAMPLE command, as shown: ANALYZE TABLE `table_stats/Tpch0.01/parquet/customer` COMPUTE STATISTICS SAMPLE 50 PERCENT; +----------+---------------------------+ @@ -247,9 +247,9 @@ You can also run ANALYZE TABLE on a percentage of the data using the SAMPLE comm ###Storing Statistics When you generate statistics, a statistics directory (`.stats.drill`) is created with a JSON file that contains the statistical data. -For tables, the `.stats.drill` directory is nested within the table directory. For example, if you ran ANALYZE TABLE against a table named “customer,” you could access the statistic file in `/customer/.stats.drill`. The JSON file is stored in the `.stats.drill` directory. +For tables, the `.stats.drill` directory is nested within the table directory. For example, if you ran ANALYZE TABLE COMPUTE STATISTICS against a table named “customer,” you could access the statistics file in `/customer/.stats.drill`. The JSON file is stored in the `.stats.drill` directory. -For directories, a new directory is written with the same name as the directory on which you ran ANALYZE TABLE, appended by `.stats.drill`. For example, if you ran ANALYZE TABLE against a directory named “customer,” you could access the JSON statistics file in the new `customer.stats.drill` directory. +For directories, a new directory is written with the same name as the directory on which you ran ANALYZE TABLE COMPUTE STATISTICS, appended by `.stats.drill`. For example, if you ran ANALYZE TABLE COMPUTE STATISTICS against a directory named “customer,” you could access the JSON statistics file in the new `customer.stats.drill` directory. 
You can query the statistics file to see the statistics generated for each column, as shown in the following two examples: @@ -272,7 +272,7 @@ You can query the statistics file to see the statistics generated for each colum ###Dropping Statistics -If you want to compute statistics on a table or directory that you have already run the ANALYZE TABLE statement against, you must first drop the statistics before you can run ANALYZE TABLE statement on the table again. +If you want to compute statistics on a table or directory that you have already run the ANALYZE TABLE COMPUTE STATISTICS statement against, you must first drop the statistics before you can run the ANALYZE TABLE COMPUTE STATISTICS statement on the table again. The following example demonstrates how to drop statistics on a table: @@ -303,9 +303,9 @@ When you drop statistics, the statistics directory no longer exists for the tabl ## Troubleshooting -Typical errors you may get when running ANALYZE TABLE result from running the statement against an individual file or against a data source other than Parquet, as shown in the following examples: +Typical errors you may get when running ANALYZE TABLE COMPUTE STATISTICS result from running the statement against an individual file or against a data source other than Parquet, as shown in the following examples: -**Running ANALYZE TABLE on a file.** +**Running ANALYZE TABLE COMPUTE STATISTICS on a file.** ANALYZE TABLE `/parquet/nation.parquet` COMPUTE STATISTICS; +--------+----------------------------------------------------------------------------------+ @@ -315,7 +315,7 @@ Typical errors you may get when running ANALYZE TABLE result from running the st +--------+----------------------------------------------------------------------------------+ -**Running ANALYZE TABLE on a data source other than Parquet.** +**Running ANALYZE TABLE COMPUTE STATISTICS on a data source other than Parquet.** ANALYZE TABLE nation1_json COMPUTE STATISTICS; 
+--------+----------------------------------------------------------------------------------+ @@ -323,18 +323,3 @@ Typical errors you may get when running ANALYZE TABLE result from running the st +--------+----------------------------------------------------------------------------------+ | false | Table nation1_json is not supported by ANALYZE. Support is currently limited to directory-based Parquet tables. | +--------+----------------------------------------------------------------------------------+ - - - - - - - - - - - - - - -