[impala] 03/04: IMPALA-10393: Iceberg field id-based column resolution fails in ASAN builds
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit d1f4a4cc457c0327cf0e5d482116b97431da154e Author: Zoltan Borok-Nagy AuthorDate: Mon Dec 14 15:20:11 2020 +0100 IMPALA-10393: Iceberg field id-based column resolution fails in ASAN builds For MAP types field id resolution indexes the top-level columns via the current 'table_idx - 1'. In this case table_idx is either SchemaPathConstants::MAP_KEY or SchemaPathConstants::MAP_VALUE which are 0 and 1 respectively. Hence 'table_idx - 1' can be -1 which is not a valid index for a vector, hence we get an ASAN error. Even if 'table_idx - 1' is zero we get a wrong field id. Note that at this point in the schema resolution we have successfully found a MAP type with a matching field id, therefore it is safe to resolve the child via the value of 'table_idx' (which is the position of the child, MAP_KEY or MAP_VALUE). Testing: * Built impala with ASAN (buildall.sh -notests -skiptests -asan), then executed test_iceberg_query Change-Id: I41e8daaebe8a6024716e6c22f6ccd819f43508bd Reviewed-on: http://gerrit.cloudera.org:8080/16873 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/exec/parquet/parquet-metadata-utils.cc | 14 +++--- tests/query_test/test_scanners.py | 2 -- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc b/be/src/exec/parquet/parquet-metadata-utils.cc index 2e77326..e2b1e46 100644 --- a/be/src/exec/parquet/parquet-metadata-utils.cc +++ b/be/src/exec/parquet/parquet-metadata-utils.cc @@ -764,17 +764,9 @@ SchemaNode* ParquetSchemaResolver::NextSchemaNode( DCHECK_EQ(col_type->type, TYPE_MAP); DCHECK(table_idx == SchemaPathConstants::MAP_KEY || table_idx == SchemaPathConstants::MAP_VALUE); - int field_id = -1; - if (table_idx == SchemaPathConstants::MAP_KEY) { -field_id = tbl_desc_.col_descs()[table_idx - 
1].field_map_key_id(); - } else { -field_id = tbl_desc_.col_descs()[table_idx - 1].field_map_value_id(); - } - file_idx = FindChildWithFieldId(node, field_id); - if (file_idx >= node->children.size()) { -// Couldn't resolve by field id, fall back to resolution by position. -file_idx = table_idx; - } + // At this point we've found a MAP with a matching field id. It's safe to resolve + // the child (key or value) by position. + file_idx = table_idx; } } else { // Resolution by position. diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py index 3192144..5a865cc 100644 --- a/tests/query_test/test_scanners.py +++ b/tests/query_test/test_scanners.py @@ -348,8 +348,6 @@ class TestIceberg(ImpalaTestSuite): create_exec_option_dimension(debug_action_options=DEBUG_ACTION_DIMS)) cls.ImpalaTestMatrix.add_constraint( lambda v: v.get_value('table_format').file_format == 'parquet') -cls.ImpalaTestMatrix.add_dimension( -ImpalaTestDimension('PARQUET_FALLBACK_SCHEMA_RESOLUTION', 2)) def test_iceberg_query(self, vector): self.run_test_case('QueryTest/iceberg-query', vector)
[impala] branch master updated (87b95a5 -> a7e71b4)
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a change to branch master in repository https://gitbox.apache.org/repos/asf/impala.git. from 87b95a5 IMPALA-10386: Don't allow PARTITION BY SPEC for non-Iceberg tables new a8ac9f8 IMPALA-10390: impala-profile-tool JSON output new 9dd0abb IMPALA-10287: Include parallelism in cost comparison of broadcast vs partition new d1f4a4c IMPALA-10393: Iceberg field id-based column resolution fails in ASAN builds new a7e71b4 IMPALA-10358: Correct Iceberg type mappings The 4 revisions listed above as "new" are entirely new to this repository and will be described in separate emails. The revisions listed as "add" were already present in the repository and have only been added to this reference. Summary of changes: be/src/exec/parquet/hdfs-parquet-table-writer.cc | 8 +- be/src/exec/parquet/hdfs-parquet-table-writer.h| 6 + be/src/exec/parquet/parquet-metadata-utils.cc | 18 +- be/src/exec/parquet/parquet-metadata-utils.h | 4 +- be/src/service/impala-hs2-server.cc| 14 +- be/src/service/impala-server.h | 5 - be/src/service/query-options.cc| 16 + be/src/service/query-options.h | 6 +- be/src/util/impala-profile-tool.cc | 79 ++- be/src/util/runtime-profile.cc | 18 + be/src/util/runtime-profile.h | 6 + common/thrift/ImpalaInternalService.thrift | 7 + common/thrift/ImpalaService.thrift | 11 + .../org/apache/impala/analysis/InsertStmt.java | 17 + .../org/apache/impala/catalog/FeIcebergTable.java | 5 + .../org/apache/impala/catalog/IcebergTable.java| 13 +- .../impala/catalog/local/LocalIcebergTable.java| 9 +- .../apache/impala/planner/DistributedPlanner.java | 21 +- .../impala/service/IcebergCatalogOpExecutor.java | 120 + .../apache/impala/util/IcebergSchemaConverter.java | 183 +++ .../java/org/apache/impala/util/IcebergUtil.java | 127 - .../org/apache/impala/planner/PlannerTest.java | 8 + .../queries/PlannerTest/tpcds-dist-method.test | 538 + .../queries/QueryTest/iceberg-insert.test | 23 - 
.../queries/QueryTest/iceberg-negative.test| 21 + tests/query_test/test_iceberg.py | 39 +- tests/query_test/test_scanners.py | 2 - 27 files changed, 999 insertions(+), 325 deletions(-) create mode 100644 fe/src/main/java/org/apache/impala/util/IcebergSchemaConverter.java create mode 100644 testdata/workloads/functional-planner/queries/PlannerTest/tpcds-dist-method.test
[impala] 02/04: IMPALA-10287: Include parallelism in cost comparison of broadcast vs partition
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit 9dd0abbb373c0256a948cdf54a1a677230e7f5cb Author: Aman Sinha AuthorDate: Thu Dec 10 23:20:05 2020 -0800 IMPALA-10287: Include parallelism in cost comparison of broadcast vs partition The current planner tends to pick broadcast distribution in some cases even when partition distribution would be more optimal (seen in TPC-DS performance runs). This patch adds 2 query options: - use_dop_for_costing (type:boolean, default:true) - broadcast_to_partition_factor (type:double, default:1.0) With use_dop_for_costing enabled, the distributed planner will increase the cost of the broadcast join's build side by C.sqrt(m) where m = degree of parallelism of the join node and, C = the broadcast_to_partition_factor This allows the planner to more favorably consider partition distribution where appropriate. The choice of sqrt in the calculation is not a final choice at this point but is intended to model a non-linear relationship between mt_dop and the query performance. After further performance testing with tuning the above factor, we can establish a better correlation and refine the formula (tracked by IMPALA-10395). Testing: - Added a new test file with TPC-DS Q78 which shows partition distribution for a left-outer join (with store_returns on the right input) in the query when the query options are enabled (it chooses broadcast otherwise). - Ran PlannerTest and TpcdsPlannerTest. - Ran e2e tests for Tpcds and Tpch. 
Change-Id: Idff569299e5c78720ca17c616a531adac78208e1 Reviewed-on: http://gerrit.cloudera.org:8080/16864 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/service/query-options.cc| 16 + be/src/service/query-options.h | 6 +- common/thrift/ImpalaInternalService.thrift | 7 + common/thrift/ImpalaService.thrift | 11 + .../apache/impala/planner/DistributedPlanner.java | 21 +- .../org/apache/impala/planner/PlannerTest.java | 8 + .../queries/PlannerTest/tpcds-dist-method.test | 538 + 7 files changed, 603 insertions(+), 4 deletions(-) diff --git a/be/src/service/query-options.cc b/be/src/service/query-options.cc index f2cd720..cc65f08 100644 --- a/be/src/service/query-options.cc +++ b/be/src/service/query-options.cc @@ -988,6 +988,22 @@ Status impala::SetQueryOption(const string& key, const string& value, query_options->__set_report_skew_limit(skew_threshold); break; } + case TImpalaQueryOptions::USE_DOP_FOR_COSTING: { +query_options->__set_use_dop_for_costing(IsTrue(value)); +break; + } + case TImpalaQueryOptions::BROADCAST_TO_PARTITION_FACTOR: { +StringParser::ParseResult result; +const double val = +StringParser::StringToFloat(value.c_str(), value.length(), ); +if (result != StringParser::PARSE_SUCCESS || val < 0 || val > 1000) { + return Status(Substitute("Invalid broadcast to partition factor '$0'. " + "Only values from 0 to 1000 are allowed.", + value)); +} +query_options->__set_broadcast_to_partition_factor(val); +break; + } default: if (IsRemovedQueryOption(key)) { LOG(WARNING) << "Ignoring attempt to set removed query option '" << key << "'"; diff --git a/be/src/service/query-options.h b/be/src/service/query-options.h index d61e47d..9abd042 100644 --- a/be/src/service/query-options.h +++ b/be/src/service/query-options.h @@ -47,7 +47,7 @@ typedef std::unordered_map // time we add or remove a query option to/from the enum TImpalaQueryOptions. 
#define QUERY_OPTS_TABLE\ DCHECK_EQ(_TImpalaQueryOptions_VALUES_TO_NAMES.size(),\ - TImpalaQueryOptions::OPTIMIZE_SIMPLE_LIMIT + 1);\ + TImpalaQueryOptions::BROADCAST_TO_PARTITION_FACTOR + 1);\ REMOVED_QUERY_OPT_FN(abort_on_default_limit_exceeded, ABORT_ON_DEFAULT_LIMIT_EXCEEDED)\ QUERY_OPT_FN(abort_on_error, ABORT_ON_ERROR, TQueryOptionLevel::REGULAR)\ REMOVED_QUERY_OPT_FN(allow_unsupported_formats, ALLOW_UNSUPPORTED_FORMATS)\ @@ -225,6 +225,10 @@ typedef std::unordered_map TQueryOptionLevel::ADVANCED)\ QUERY_OPT_FN(optimize_simple_limit, OPTIMIZE_SIMPLE_LIMIT,\ TQueryOptionLevel::REGULAR)\ + QUERY_OPT_FN(use_dop_for_costing, USE_DOP_FOR_COSTING,\ + TQueryOptionLevel::ADVANCED)\ + QUERY_OPT_FN(broadcast_to_partition_factor, BROADCAST_TO_PARTITION_FACTOR,\ + TQueryOptionLevel::ADVANCED)\ ; /// Enforce practical limits on some query options to avoid undesired query state. diff --git a/common/thrift/ImpalaInternalService.thrift
[impala] 01/04: IMPALA-10390: impala-profile-tool JSON output
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit a8ac9f8a57730880520448df1c72a0d5b938fa7d Author: Tim Armstrong AuthorDate: Thu Dec 10 11:04:44 2020 -0800 IMPALA-10390: impala-profile-tool JSON output Add --profile_format option that takes options "text", "json" or "prettyjson". "json" and "prettyjson" output the JSON representation of each profile in a dense single-line form and in a human-readable multi-line form respectively. Also implement usage output when --help is passed in. Change-Id: I82ae0fe9379b7e3cbe93166adaa4c37212ea0f67 Reviewed-on: http://gerrit.cloudera.org:8080/16855 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/service/impala-hs2-server.cc | 14 +-- be/src/service/impala-server.h | 5 --- be/src/util/impala-profile-tool.cc | 79 + be/src/util/runtime-profile.cc | 18 + be/src/util/runtime-profile.h | 6 +++ 5 files changed, 89 insertions(+), 33 deletions(-) diff --git a/be/src/service/impala-hs2-server.cc b/be/src/service/impala-hs2-server.cc index 5c618fb..9cd55bb 100644 --- a/be/src/service/impala-hs2-server.cc +++ b/be/src/service/impala-hs2-server.cc @@ -32,8 +32,6 @@ #include #include #include -#include -#include #include #include "common/logging.h" @@ -1030,15 +1028,6 @@ void ImpalaServer::GetExecSummary(TGetExecSummaryResp& return_val, return_val.status.__set_statusCode(thrift::TStatusCode::SUCCESS_STATUS); } -void ImpalaServer::JsonProfileToStringProfile( -const rapidjson::Document& json_profile, stringstream* string_profile) { - // Serialize to JSON without extra whitespace/formatting. - rapidjson::StringBuffer sb; - rapidjson::Writer writer(sb); - json_profile.Accept(writer); - *string_profile << sb.GetString(); -} - // Add the given Thrift profile to the list of failed thrift profiles for the given // TGetRuntimeProfileResp. 
void SetFailedProfile( @@ -1069,7 +1058,8 @@ void ImpalaServer::SetProfile(TGetRuntimeProfileResp& get_profile_resp, } } else if (profile_format == TRuntimeProfileFormat::JSON) { DCHECK(profile.json_output != nullptr); -JsonProfileToStringProfile(*profile.json_output, profile.string_output); +RuntimeProfile::JsonProfileToString( +*profile.json_output, /*pretty=*/false, profile.string_output); if (set_failed_profile) { SetFailedProfile(profile.string_output, get_profile_resp); } else { diff --git a/be/src/service/impala-server.h b/be/src/service/impala-server.h index 2347030..4971dfe 100644 --- a/be/src/service/impala-server.h +++ b/be/src/service/impala-server.h @@ -769,11 +769,6 @@ class ImpalaServer : public ImpalaServiceIf, Status GetRuntimeProfileOutput(const string& user, const QueryHandle& query_handle, TRuntimeProfileFormat::type format, RuntimeProfileOutput* profile); - /// Converts a JSON Document representation of a profile to a string representation. - /// Both parameters cannot be nullptr. - void JsonProfileToStringProfile(const rapidjson::Document& json_profile, - std::stringstream* string_profile); - /// Set the profile (or thrift_profile) field for the given TRuntimeProfileFormat /// using the profile from the given RuntimeProfileOutput. If 'set_failed_profile' /// is true, then the profile is added to the 'failed_profile' field of diff --git a/be/src/util/impala-profile-tool.cc b/be/src/util/impala-profile-tool.cc index 9fe6f6f..5618978 100644 --- a/be/src/util/impala-profile-tool.cc +++ b/be/src/util/impala-profile-tool.cc @@ -19,6 +19,8 @@ #include #include #include + +#include #include #include "common/object-pool.h" @@ -26,36 +28,64 @@ #include "common/names.h" -// Utility to decode an Impala profile log from standard input. -// The profile log is consumed from standard input and each successfully parsed entry -// is pretty-printed to standard output. 
-// -// Example usage: -// impala-profile-tool < impala_profile_log_1.1-1607057366897 -// -// The following options are supported: -// --query_id=: given an impala query ID, only process profiles with this -// query id -// --min_timestamp=: only process profiles at or after this timestamp -// --max_timestamp=: only process profiles at or before this timestamp -// -// --gen_experimental_profile: if set to true, generates full output for the new -// experimental profile. +static const char* USAGE = +"Utility to decode an Impala profile log from standard input.\n" +"\n" +"The profile log is consumed from standard input and each successfully parsed entry" +" is pretty-printed to standard output.\n" +"\n" +"Usage:" +" impala-profile-tool < impala_profile_log_1.1-1607057366897\n" +"\n" +"The following options are
[impala] 04/04: IMPALA-10358: Correct Iceberg type mappings
This is an automated email from the ASF dual-hosted git repository. tarmstrong pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git commit a7e71b45232c90af09ff70a7596db56688cfeb31 Author: Zoltan Borok-Nagy AuthorDate: Thu Dec 10 17:41:39 2020 +0100 IMPALA-10358: Correct Iceberg type mappings The Iceberg format spec defines what types to use for different file formats, e.g.: https://iceberg.apache.org/spec/#parquet Impala should follow the specification, so this patch * annotates strings with UTF8 in Parquet metadata * removes fixed(L) <-> CHAR(L) mapping * forbids INSERTs when the Iceberg schema has a TIMESTAMPTZ column This patch also refactors the type/schema conversions as Impala => Iceberg conversions were duplicated in IcebergCatalogOpExecutor and IcebergUtil. I introduced the class 'IcebergSchemaConverter' to contain the code for conversions. Testing: * added test to check CHAR and VARCHAR types are not allowed * test that INSERTs are not allowed when the table has TIMESTAMPTZ * added test to check that strings are annotated with UTF8 Change-Id: I652565f82708824f5cf7497139153b06f116ccd3 Reviewed-on: http://gerrit.cloudera.org:8080/16851 Reviewed-by: Impala Public Jenkins Tested-by: Impala Public Jenkins --- be/src/exec/parquet/hdfs-parquet-table-writer.cc | 8 +- be/src/exec/parquet/hdfs-parquet-table-writer.h| 6 + be/src/exec/parquet/parquet-metadata-utils.cc | 4 +- be/src/exec/parquet/parquet-metadata-utils.h | 4 +- .../org/apache/impala/analysis/InsertStmt.java | 17 ++ .../org/apache/impala/catalog/FeIcebergTable.java | 5 + .../org/apache/impala/catalog/IcebergTable.java| 13 +- .../impala/catalog/local/LocalIcebergTable.java| 9 +- .../impala/service/IcebergCatalogOpExecutor.java | 120 +- .../apache/impala/util/IcebergSchemaConverter.java | 183 + .../java/org/apache/impala/util/IcebergUtil.java | 127 -- .../queries/QueryTest/iceberg-insert.test | 23 --- .../queries/QueryTest/iceberg-negative.test| 21 +++ 
tests/query_test/test_iceberg.py | 39 - 14 files changed, 304 insertions(+), 275 deletions(-) diff --git a/be/src/exec/parquet/hdfs-parquet-table-writer.cc b/be/src/exec/parquet/hdfs-parquet-table-writer.cc index 5203c3f..f835fe4 100644 --- a/be/src/exec/parquet/hdfs-parquet-table-writer.cc +++ b/be/src/exec/parquet/hdfs-parquet-table-writer.cc @@ -997,6 +997,11 @@ void HdfsParquetTableWriter::ConfigureTimestampType() { timestamp_type_ = state_->query_options().parquet_timestamp_type; } +void HdfsParquetTableWriter::ConfigureStringType() { + string_utf8_ = is_iceberg_file_ || + state_->query_options().parquet_annotate_strings_utf8; +} + Status HdfsParquetTableWriter::Init() { // Initialize file metadata file_metadata_.version = PARQUET_CURRENT_VERSION; @@ -1062,6 +1067,7 @@ Status HdfsParquetTableWriter::Init() { Codec::CodecInfo codec_info(codec, clevel); ConfigureTimestampType(); + ConfigureStringType(); columns_.resize(num_cols); // Initialize each column structure. @@ -1178,7 +1184,7 @@ Status HdfsParquetTableWriter::CreateSchema() { DCHECK_EQ(col_desc.name(), columns_[i]->column_name()); const int field_id = col_desc.field_id(); if (field_id != -1) col_schema.__set_field_id(field_id); -ParquetMetadataUtils::FillSchemaElement(col_type, state_->query_options(), +ParquetMetadataUtils::FillSchemaElement(col_type, string_utf8_, timestamp_type_, _schema); } diff --git a/be/src/exec/parquet/hdfs-parquet-table-writer.h b/be/src/exec/parquet/hdfs-parquet-table-writer.h index 672fa33..aadad1f 100644 --- a/be/src/exec/parquet/hdfs-parquet-table-writer.h +++ b/be/src/exec/parquet/hdfs-parquet-table-writer.h @@ -160,6 +160,9 @@ class HdfsParquetTableWriter : public HdfsTableWriter { /// Selects the Parquet timestamp type to be used by this writer. void ConfigureTimestampType(); + /// Sets 'string_utf8_' based on query options and table type. + void ConfigureStringType(); + /// Updates output partition with some summary about the written file. 
void FinalizePartitionInfo(); @@ -225,6 +228,9 @@ class HdfsParquetTableWriter : public HdfsTableWriter { /// True if we are writing an Iceberg data file. In that case the writer behaves a /// bit differently, e.g. writes specific type of timestamps, fills some extra metadata. bool is_iceberg_file_ = false; + + /// If true, STRING values are annotated with UTF8 in Parquet metadata. + bool string_utf8_ = false; }; } diff --git a/be/src/exec/parquet/parquet-metadata-utils.cc b/be/src/exec/parquet/parquet-metadata-utils.cc index e2b1e46..2caa949 100644 --- a/be/src/exec/parquet/parquet-metadata-utils.cc +++